From 6859a8402945cf1d74af75a2e1aa4e327a506ab4 Mon Sep 17 00:00:00 2001 From: Alan Mayer Date: Wed, 26 Mar 2008 16:11:31 -0500 Subject: x86: resize NR_IRQS for large machines On machines with very large numbers of cpus, tables that are dimensioned by NR_IRQS get very large, especially the irq_desc table. They are also very sparsely used. When the cpu count is > MAX_IO_APICS, use MAX_IO_APICS to set NR_IRQS, otherwise use NR_CPUS. Signed-off-by: Alan Mayer Reviewed-by: Christoph Lameter Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/kernel_stat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index e8ffce898bf..cf9f40a91c9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -1,11 +1,11 @@ #ifndef _LINUX_KERNEL_STAT_H #define _LINUX_KERNEL_STAT_H -#include #include #include #include #include +#include #include /* -- cgit v1.2.3-70-g09d2 From 988f7b5789ccf5cfed14c72e28573a49f0cb4809 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Tue, 18 Mar 2008 17:00:22 -0700 Subject: x86: PAT export resource_wc in pci sysfs For the ranges with IORESOURCE_PREFETCH, export a new resource_wc interface in pci /sysfs along with resource (which is uncached). Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Acked-by: Jesse Barnes Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/filesystems/sysfs-pci.txt | 1 + drivers/pci/pci-sysfs.c | 84 ++++++++++++++++++++++++--------- include/linux/pci.h | 1 + 3 files changed, 64 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 5daa2aaec2c..68ef48839c0 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -36,6 +36,7 @@ files, each with their own function. 
local_cpus nearby CPU mask (cpumask, ro) resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap) + resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) subsystem_vendor PCI subsystem vendor (ascii, ro) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 271d41cc05a..9ec7d3977a8 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -489,13 +489,14 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr, * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap + * @write_combine: 1 for write_combine mapping * * Use the regular PCI mapping routines to map a PCI resource into userspace. * FIXME: write combining? maybe automatic for prefetchable regions? */ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, int write_combine) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, struct device, kobj)); @@ -518,7 +519,21 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? 
pci_mmap_mem : pci_mmap_io; - return pci_mmap_page_range(pdev, vma, mmap_type, 0); + return pci_mmap_page_range(pdev, vma, mmap_type, write_combine); +} + +static int +pci_mmap_resource_uc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 0); +} + +static int +pci_mmap_resource_wc(struct kobject *kobj, struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + return pci_mmap_resource(kobj, attr, vma, 1); } /** @@ -541,9 +556,46 @@ pci_remove_resource_files(struct pci_dev *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); kfree(res_attr); } + + res_attr = pdev->res_attr_wc[i]; + if (res_attr) { + sysfs_remove_bin_file(&pdev->dev.kobj, res_attr); + kfree(res_attr); + } } } +static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) +{ + /* allocate attribute structure, piggyback attribute name */ + int name_len = write_combine ? 13 : 10; + struct bin_attribute *res_attr; + int retval; + + res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC); + if (res_attr) { + char *res_attr_name = (char *)(res_attr + 1); + + if (write_combine) { + pdev->res_attr_wc[num] = res_attr; + sprintf(res_attr_name, "resource%d_wc", num); + res_attr->mmap = pci_mmap_resource_wc; + } else { + pdev->res_attr[num] = res_attr; + sprintf(res_attr_name, "resource%d", num); + res_attr->mmap = pci_mmap_resource_uc; + } + res_attr->attr.name = res_attr_name; + res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->size = pci_resource_len(pdev, num); + res_attr->private = &pdev->resource[num]; + retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); + } else + retval = -ENOMEM; + + return retval; +} + /** * pci_create_resource_files - create resource files in sysfs for @dev * @dev: dev in question @@ -557,31 +609,19 @@ static int pci_create_resource_files(struct pci_dev *pdev) /* Expose the PCI resources from this device as files */ for (i = 0; i < PCI_ROM_RESOURCE; i++) { - struct 
bin_attribute *res_attr; /* skip empty resources */ if (!pci_resource_len(pdev, i)) continue; - /* allocate attribute structure, piggyback attribute name */ - res_attr = kzalloc(sizeof(*res_attr) + 10, GFP_ATOMIC); - if (res_attr) { - char *res_attr_name = (char *)(res_attr + 1); - - pdev->res_attr[i] = res_attr; - sprintf(res_attr_name, "resource%d", i); - res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; - res_attr->size = pci_resource_len(pdev, i); - res_attr->mmap = pci_mmap_resource; - res_attr->private = &pdev->resource[i]; - retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); - if (retval) { - pci_remove_resource_files(pdev); - return retval; - } - } else { - return -ENOMEM; + retval = pci_create_attr(pdev, i, 0); + /* for prefetchable resources, create a WC mappable file */ + if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH) + retval = pci_create_attr(pdev, i, 1); + + if (retval) { + pci_remove_resource_files(pdev); + return retval; } } return 0; diff --git a/include/linux/pci.h b/include/linux/pci.h index 509159bcd4e..d18b1dd49fa 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -206,6 +206,7 @@ struct pci_dev { struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */ int rom_attr_enabled; /* has display of the rom attribute been enabled? 
*/ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ + struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ #ifdef CONFIG_PCI_MSI struct list_head msi_list; #endif -- cgit v1.2.3-70-g09d2 From 1a189b97190d3f0f8cf0379a799d3555b2d648bb Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 13 Apr 2008 21:41:55 +0100 Subject: [ARM] pxa: Add bare bones PWM API Signed-off-by: Russell King --- arch/arm/Kconfig | 3 +++ include/linux/pwm.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 include/linux/pwm.h (limited to 'include/linux') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b786e68914d..c274dbb89a8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -22,6 +22,9 @@ config ARM Europe. There is an ARM Linux project with a web page at . +config HAVE_PWM + bool + config SYS_SUPPORTS_APM_EMULATION bool diff --git a/include/linux/pwm.h b/include/linux/pwm.h new file mode 100644 index 00000000000..3945f803d51 --- /dev/null +++ b/include/linux/pwm.h @@ -0,0 +1,31 @@ +#ifndef __LINUX_PWM_H +#define __LINUX_PWM_H + +struct pwm_device; + +/* + * pwm_request - request a PWM device + */ +struct pwm_device *pwm_request(int pwm_id, const char *label); + +/* + * pwm_free - free a PWM device + */ +void pwm_free(struct pwm_device *pwm); + +/* + * pwm_config - change a PWM device configuration + */ +int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns); + +/* + * pwm_enable - start a PWM output toggling + */ +int pwm_enable(struct pwm_device *pwm); + +/* + * pwm_disable - stop a PWM output toggling + */ +void pwm_disable(struct pwm_device *pwm); + +#endif /* __LINUX_PWM_H */ -- cgit v1.2.3-70-g09d2 From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:41 +0200 Subject: sched: add latency tracer callbacks to the scheduler add 3 lightweight callbacks to the tracer 
backend. zero impact if tracing is turned off. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 26 ++++++++++++++++++++++++++ kernel/sched.c | 3 +++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a6176f4..717cab8a0c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2117,6 +2117,32 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern void +ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next); +#else +static inline void +ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +{ +} +#endif + +#ifdef CONFIG_SCHED_TRACER +extern void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); +extern void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr); +#else +static inline void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ +} +static inline void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) +{ +} +#endif + extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a9153..463dcdb36ef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ + ftrace_wake_up_task(p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } + ftrace_wake_up_new_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct 
task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); + ftrace_ctx_switch(prev, next); mm = next->mm; oldmm = prev->active_mm; /* -- cgit v1.2.3-70-g09d2 From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: make the task state char-string visible to all The tracer wants to be able to convert the state number into a user visible character. This patch pulls that conversion string out the scheduler into the header. This way if it were to ever change, other parts of the kernel will know. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 717cab8a0c8..6e26f1fdbfe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2237,6 +2237,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) } #endif /* CONFIG_MM_OWNER */ +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + #endif /* __KERNEL__ */ #endif diff --git a/kernel/sched.c b/kernel/sched.c index 463dcdb36ef..73e60085236 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5729,7 +5729,7 @@ out_unlock: return retval; } -static const char stat_nam[] = "RSDTtZX"; +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; void sched_show_task(struct task_struct *p) { -- cgit v1.2.3-70-g09d2 From 502825282e6f79c975a644afc124432ec1744de4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: ftrace: add preempt_enable/disable notrace macros The tracer may need to call preempt_enable and disable functions for time keeping and such. The trace gets ugly when we see these functions show up for all traces. 
To make the output cleaner this patch adds preempt_enable_notrace and preempt_disable_notrace to be used by tracer (and debugging) functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/preempt.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 23f0c54175c..36b03d50bf4 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -52,6 +52,34 @@ do { \ preempt_check_resched(); \ } while (0) +/* For debugging and tracer internals only! */ +#define add_preempt_count_notrace(val) \ + do { preempt_count() += (val); } while (0) +#define sub_preempt_count_notrace(val) \ + do { preempt_count() -= (val); } while (0) +#define inc_preempt_count_notrace() add_preempt_count_notrace(1) +#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) + +#define preempt_disable_notrace() \ +do { \ + inc_preempt_count_notrace(); \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched_notrace() \ +do { \ + barrier(); \ + dec_preempt_count_notrace(); \ +} while (0) + +/* preempt_check_resched is OK to trace */ +#define preempt_enable_notrace() \ +do { \ + preempt_enable_no_resched_notrace(); \ + barrier(); \ + preempt_check_resched(); \ +} while (0) + #else #define preempt_disable() do { } while (0) @@ -59,6 +87,10 @@ do { \ #define preempt_enable() do { } while (0) #define preempt_check_resched() do { } while (0) +#define preempt_disable_notrace() do { } while (0) +#define preempt_enable_no_resched_notrace() do { } while (0) +#define preempt_enable_notrace() do { } while (0) + #endif #ifdef CONFIG_PREEMPT_NOTIFIERS -- cgit v1.2.3-70-g09d2 From ffdc1a09ae7e2cbd714a446ee38a27f625b5f1c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:41 +0200 Subject: tracing: add notrace to linkage.h notrace signals that a function should not be traced. 
Most of the time this is used by tracers to annotate code that cannot be traced - it's in a volatile state (such as in user vdso context or NMI context) or it's in the tracer internals. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/linkage.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 2119610b24f..14f329c64ba 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -3,6 +3,8 @@ #include +#define notrace __attribute__((no_instrument_function)) + #ifdef __cplusplus #define CPP_ASMLINKAGE extern "C" #else -- cgit v1.2.3-70-g09d2 From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value the ftrace routine will be called every time we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function if a function happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call. 
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Makefile | 4 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 27 +++++++++ arch/x86/kernel/entry_64.S | 37 ++++++++++++ include/linux/ftrace.h | 38 +++++++++++++ kernel/Makefile | 1 + kernel/trace/Kconfig | 5 ++ kernel/trace/Makefile | 3 + kernel/trace/ftrace.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 2 + 10 files changed, 256 insertions(+) create mode 100644 include/linux/ftrace.h create mode 100644 kernel/trace/Kconfig create mode 100644 kernel/trace/Makefile create mode 100644 kernel/trace/ftrace.c (limited to 'include/linux') diff --git a/Makefile b/Makefile index 20b32351906..b4a273f19b5 100644 --- a/Makefile +++ b/Makefile @@ -528,6 +528,10 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # We trigger additional mismatches with less inlining ifdef CONFIG_DEBUG_SECTION_MISMATCH KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae7ef2..c742dfeb0db 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271..f47b9b5440d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace + +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + 
+ popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a..f046e0c6488 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -54,6 +54,43 @@ .code64 +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h new file mode 100644 index 00000000000..b96ef14c249 --- /dev/null +++ b/include/linux/ftrace.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#ifdef CONFIG_FTRACE + +#include + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparely. Never free an ftrace_op or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. 
+ */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +#endif /* CONFIG_FTRACE */ +#endif /* _LINUX_FTRACE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..fa05f6d8bdb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 00000000000..8185c91417b --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 00000000000..bf4fd215a6a --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 00000000000..b6a80b98a3f --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. 
+ * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + /* + * For one func, simply call it directly. + * For more than one func, call the chain. 
+ */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. Returns 0 on success, or -1 if @ops is not on the list. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + struct ftrace_ops **p; + int ret = 0; + + spin_lock_irqsave(&ftrace_func_lock, flags); + + /* + * If we are the only function, then the ftrace pointer is + * pointing directly to that function. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + + out: + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return ret; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This points the ftrace function back at ftrace_stub and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f41aa1..d8b6279a9b4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -634,6 +634,8 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. 
+source kernel/trace/Kconfig + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 -- cgit v1.2.3-70-g09d2 From 352ad25aa4a189c667cb2af333948d34692a2d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: tracer for scheduler wakeup latency This patch adds the tracer that tracks the wakeup latency of the highest priority waking task. "wakeup" is added to /debugfs/tracing/available_tracers Also added to /debugfs/tracing tracing_max_latency holds the current max latency for the wakeup wakeup_thresh if set to other than zero, a log will be recorded for every wakeup that takes longer than the number entered in here (usecs for all counters) (deletes previous trace) Examples: (with ftrace_enabled = 0) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / quilt-8551 0d..3 0us+: wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) quilt-8551 0d..4 26us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ (with ftrace_enabled = 1) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 -------------------------------------------------------------------- latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / bash-10653 1d..3 0us : 
wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) bash-10653 1d..3 1us : try_to_wake_up+0x271/0x2e7 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..2 2us : try_to_wake_up+0x296/0x2e7 (update_rq_clock+0x9/0x20 ) bash-10653 1d..2 2us : update_rq_clock+0x1e/0x20 (__update_rq_clock+0xc/0x90 ) bash-10653 1d..2 3us : __update_rq_clock+0x1b/0x90 (sched_clock+0x9/0x29 ) bash-10653 1d..2 4us : try_to_wake_up+0x2a6/0x2e7 (activate_task+0xc/0x3f ) bash-10653 1d..2 4us : activate_task+0x2d/0x3f (enqueue_task+0xe/0x66 ) bash-10653 1d..2 5us : enqueue_task+0x5b/0x66 (enqueue_task_rt+0x9/0x3c ) bash-10653 1d..2 6us : try_to_wake_up+0x2ba/0x2e7 (check_preempt_wakeup+0x12/0x99 ) [...] bash-10653 1d..5 33us : tracing_record_cmdline+0xcf/0xd4 (_spin_unlock+0x9/0x33 ) bash-10653 1d..5 34us : _spin_unlock+0x19/0x33 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..4 35us : wakeup_sched_switch+0x65/0x2ff (_spin_lock_irqsave+0xc/0xa9 ) bash-10653 1d..4 35us : _spin_lock_irqsave+0x19/0xa9 (add_preempt_count+0xe/0x77 ) bash-10653 1d..4 36us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ The [...] was added here to not waste your email box space. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 23 ++- kernel/trace/Kconfig | 13 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_sched_wakeup.c | 310 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 kernel/trace/trace_sched_wakeup.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b96ef14c249..db8a5e7abe4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,10 +5,6 @@ #include -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) - typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { @@ -35,4 +31,23 @@ extern void mcount(void); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ + + +#ifdef CONFIG_FRAME_POINTER +/* TODO: need to fix this for ARM */ +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d6aa92866c..892ecc94a82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config TRACER_MAX_TRACE + 
bool + config TRACING bool select DEBUG_FS @@ -23,6 +26,16 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6b54ceb7f16..5508cdb19ae 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,5 +3,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 00000000000..7c3ccefcf4c --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,310 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static DEFINE_SPINLOCK(wakeup_lock); + +static void notrace __wakeup_reset(struct trace_array *tr); + +/* + * Should this new latency be reported/recorded? 
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. + */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* The task we are waitng for is waking up */ + data = tr->data[wakeup_cpu]; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + spin_lock_irqsave(&wakeup_lock, flags); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + latency = nsecs_to_usecs(delta); + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + update_max_tr(tr, wakeup_task, wakeup_cpu); + + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), 
t0); + } else { + printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " + "wakeup latency.\n => started at timestamp %lu: ", + wakeup_task->comm, wakeup_task->pid, + cpu, latency, t0); + } + + printk(KERN_CONT " ended at timestamp %lu: ", t1); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); + +out_unlock: + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +static void notrace __wakeup_reset(struct trace_array *tr) +{ + struct trace_array_cpu *data; + int cpu; + + assert_spin_locked(&wakeup_lock); + + for_each_possible_cpu(cpu) { + data = tr->data[cpu]; + tracing_reset(data); + } + + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void notrace wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + spin_lock_irqsave(&wakeup_lock, flags); + __wakeup_reset(tr); + spin_unlock_irqrestore(&wakeup_lock, flags); +} + +static notrace void +wakeup_check_start(struct trace_array *tr, struct task_struct *p, + struct task_struct *curr) +{ + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + + if (likely(!rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= curr->prio) + return; + + disabled = atomic_inc_return(&tr->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + spin_lock(&wakeup_lock); + + /* check for races. 
*/ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(tr); + + wakeup_cpu = task_cpu(p); + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + tr->data[wakeup_cpu]->preempt_timestamp = now(cpu); + ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags); + +out_locked: + spin_unlock(&wakeup_lock); +out: + atomic_dec(&tr->data[cpu]->disabled); +} + +notrace void +ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +notrace void +ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) +{ + if (likely(!tracer_enabled)) + return; + + wakeup_check_start(wakeup_trace, wakee, curr); +} + +static notrace void start_wakeup_tracer(struct trace_array *tr) +{ + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. 
+ */ + smp_wmb(); + + tracer_enabled = 1; + + return; +} + +static notrace void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static notrace void wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_trace = tr; + + if (tr->ctrl) + start_wakeup_tracer(tr); +} + +static notrace void wakeup_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) { + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + } +} + +static void wakeup_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_wakeup_tracer(tr); + else + stop_wakeup_tracer(tr); +} + +static void notrace wakeup_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_wakeup_tracer(iter->tr); +} + +static void notrace wakeup_tracer_close(struct trace_iterator *iter) +{ + /* forget about any processes we were recording */ + if (iter->tr->ctrl) + start_wakeup_tracer(iter->tr); +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .open = wakeup_tracer_open, + .close = wakeup_tracer_close, + .ctrl_update = wakeup_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); -- cgit v1.2.3-70-g09d2 From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace irq disabled critical timings This patch adds latency tracing for critical timings (how long interrupts are disabled for). "irqsoff" is added to /debugfs/tracing/available_tracers Note: tracing_max_latency also holds the max latency for irqsoff (in usecs). 
(default to large number so one must start latency tracing) tracing_thresh threshold (in usecs) to always print out if irqs off is detected to be longer than stated here. If irq_thresh is non-zero, then max_irq_latency is ignored. Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : 
e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 3 + arch/x86/lib/Makefile | 1 + arch/x86/lib/thunk_32.S | 47 +++++ arch/x86/lib/thunk_64.S | 19 +- include/asm-x86/irqflags.h | 24 +-- include/linux/ftrace.h | 8 + include/linux/irqflags.h | 12 +- kernel/fork.c | 2 +- kernel/lockdep.c | 23 ++- kernel/printk.c | 2 + kernel/trace/Kconfig | 18 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 531 insertions(+), 31 deletions(-) create mode 100644 arch/x86/lib/thunk_32.S create mode 100644 kernel/trace/trace_irqsoff.c (limited to 'include/linux') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988..dd349c92f05 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + 
start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 76f60f52a88..84aa2883fe1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay_$(BITS).o +lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 00000000000..650b11e00ec --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index e009251d4e9..bf9a7d5a542 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,6 +2,7 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. 
* Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. */ @@ -42,8 +43,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushs 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index c242527f970..24d71b1eb18 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void) * have a reliable stack. x86_64 only. 
*/ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void) TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db8a5e7abe4..0a20445dcbc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -50,4 +50,12 @@ extern void mcount(void); # define CALLER_ADDR5 0UL #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index e600c4e9b8c..5b711d4e9fd 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,10 +12,10 @@ #define _LINUX_TRACE_IRQFLAGS_H #ifdef CONFIG_TRACE_IRQFLAGS - extern void trace_hardirqs_on(void); - extern void trace_hardirqs_off(void); extern void 
trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -41,6 +41,14 @@ # define INIT_TRACE_IRQFLAGS #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..d66d676dc36 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f08..e21924365ea 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * 
Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..ae7d5b9e535 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc94a82..896df1cf6ad 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. 
+config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb19ae..46be8647fb6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 00000000000..a9131b0cf1a --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off criticall timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. 
Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? 
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + 
data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); 
+ + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the 
tracer is visibel */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v1.2.3-70-g09d2 From 6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers Now instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as irqs off timings. One can either trace preemption off, irqs off, or one or the other off. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. 
Runtime switching of these options is now supported by simply echoing in the appropriate trace name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 3 + include/linux/ftrace.h | 8 ++ include/linux/irqflags.h | 3 +- include/linux/preempt.h | 2 +- kernel/sched.c | 24 +++++- kernel/trace/Kconfig | 25 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------ 8 files changed, 197 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60..a30aa1f2607 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -185,7 +185,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0a20445dcbc..740c97dcf9c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,4 +58,12 @@ extern void mcount(void); # define time_hardirqs_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_TRACER + extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); + extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); +#else +# define trace_preempt_on(a0, a1) do { } while (0) +# define trace_preempt_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5b711d4e9fd..2b1c2e58566 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -41,7 +41,8 @@ # define INIT_TRACE_IRQFLAGS #endif -#ifdef CONFIG_IRQSOFF_TRACER +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) extern void
stop_critical_timings(void); extern void start_critical_timings(void); #else diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 36b03d50bf4..72b1a10a59b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else diff --git a/kernel/sched.c b/kernel/sched.c index 73e60085236..328494e28df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? 
*/ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1cf6ad..6430016b98e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) 
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be8647fb6..3fec653d653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0cf1a..8b1231633dc 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if 
(likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? : ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings used to for stoppage (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace 
trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visibel */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = 
irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v1.2.3-70-g09d2 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. 
call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMIs so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ftrace.c | 237 +++++++++++++++++++++++++++++++ include/linux/ftrace.h | 18 +++ kernel/trace/Kconfig | 17 +++ kernel/trace/ftrace.c | 356 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 597 insertions(+), 32 deletions(-) create mode 100644 arch/x86/kernel/ftrace.c (limited to 'include/linux') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b472..e142091524b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 00000000000..5dd58136ef0 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,237 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#define CALL_BACK 5 + +#define JMPFWD 0x03eb + +static unsigned short ftrace_jmp = JMPFWD; + +struct ftrace_record { + struct dyn_ftrace rec; + int failed; +} __attribute__((packed)); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct ftrace_record records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +#define MCOUNT_ADDR ((long)(&mcount)) + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + struct ftrace_record *rec; + unsigned short save; + + ip -= CALL_BACK; + save = *(short *)ip; + + /* If this was already converted, skip it */ + if (save == JMPFWD) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + rec = &ftrace_pages->records[ftrace_pages->index++]; + + return &rec->rec; +} + +static int notrace +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned short old = *(unsigned short *)old_code; + unsigned short new = 
*(unsigned short *)new_code; + unsigned short replaced; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. + */ + asm volatile ( + "1: lock\n" + " cmpxchg %w3, (%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + " movl $1, %0\n" + "3: jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old) + faulted = 2; + + return faulted; +} + +static int notrace ftrace_calc_offset(long ip) +{ + return (int)(MCOUNT_ADDR - ip); +} + +notrace void ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + union ftrace_code_union save; + struct ftrace_record *r = + container_of(rec, struct ftrace_record, rec); + + ip = rec->ip; + + save.e8 = 0xe8; + save.offset = ftrace_calc_offset(ip); + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); +} + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct ftrace_record *rec; + struct ftrace_page *pg; + unsigned long ip; + int i; + + if (saved) + old = (char *)&ftrace_jmp; + else + new = (char *)&ftrace_jmp; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + union ftrace_code_union calc; + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->failed) + continue; + + ip = rec->rec.ip; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip); + + if (saved) + new = calc.code; + else + old = calc.code; + + ip -= CALL_BACK; + + rec->failed = ftrace_modify_code(ip, old, new); + } + } + +} + +notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +notrace void ftrace_shutdown_code(void) +{ + 
ftrace_replace_code(0); +} + +notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +notrace int ftrace_shutdown_arch_init(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 740c97dcf9c..90dbc0ee204 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -32,6 +32,24 @@ extern void mcount(void); # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. 
+ * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. 
*/ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have 
hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. + */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* 
Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. 
+ */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v1.2.3-70-g09d2 From b0fc494fae96a7089f3651cb451f461c7291244c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add ftrace_enabled sysctl to disable mcount function This patch adds back the sysctl ftrace_enabled. This time it is defaulted to on, if DYNAMIC_FTRACE is configured. When ftrace_enabled is disabled, the ftrace function is set to the stub return. If DYNAMIC_FTRACE is also configured, on ftrace_enabled = 0, the registered ftrace functions will all be set to jmps, but no more new calls to ftrace recording (used to find the ftrace calling sites) will be called. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 6 +++ kernel/sysctl.c | 11 +++++ kernel/trace/ftrace.c | 125 ++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 124 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 90dbc0ee204..ccd8537dbdb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,6 +5,12 @@ #include +extern int ftrace_enabled; +extern int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..efaf7c5500e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + 
.data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1ae2ba2527..d3de37299ba 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -20,12 +20,24 @@ #include #include #include +#include #include #include #include "trace.h" +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_ENABLED_INIT 1 +#else +# define FTRACE_ENABLED_INIT 0 +#endif + +int ftrace_enabled = FTRACE_ENABLED_INIT; +static int last_ftrace_enabled = FTRACE_ENABLED_INIT; + static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -78,14 +90,16 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) smp_wmb(); ftrace_list = ops; - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. 
+ */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } spin_unlock(&ftrace_lock); @@ -120,10 +134,12 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } out: spin_unlock(&ftrace_lock); @@ -263,7 +279,8 @@ static void notrace ftrace_startup(void) goto out; __unregister_ftrace_function(&ftrace_shutdown_ops); - ftrace_run_startup_code(); + if (ftrace_enabled) + ftrace_run_startup_code(); out: mutex_unlock(&ftraced_lock); } @@ -275,13 +292,32 @@ static void notrace ftrace_shutdown(void) if (ftraced_suspend) goto out; - ftrace_run_shutdown_code(); + if (ftrace_enabled) + ftrace_run_shutdown_code(); __register_ftrace_function(&ftrace_shutdown_ops); out: mutex_unlock(&ftraced_lock); } +static void notrace ftrace_startup_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + ftrace_run_startup_code(); + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + ftrace_run_shutdown_code(); + mutex_unlock(&ftraced_lock); +} + static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; @@ -341,8 +377,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftraced_trigger && !ftraced_suspend) { + if 
(ftrace_enabled && ftraced_trigger && !ftraced_suspend) { ftrace_record_suspend++; ftrace_update_code(); usecs = nsecs_to_usecs(ftrace_update_time); @@ -360,6 +397,7 @@ static int notrace ftraced(void *ignore) ftrace_record_suspend--; } mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); ftrace_shutdown_replenish(); @@ -389,8 +427,10 @@ static int __init notrace ftrace_shutdown_init(void) core_initcall(ftrace_shutdown_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** @@ -406,9 +446,15 @@ core_initcall(ftrace_shutdown_init); */ int register_ftrace_function(struct ftrace_ops *ops) { + int ret; + + mutex_lock(&ftrace_sysctl_lock); ftrace_startup(); - return __register_ftrace_function(ops); + ret = __register_ftrace_function(ops); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; } /** @@ -421,10 +467,53 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; + mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); if (ftrace_list == &ftrace_list_end) ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +notrace int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + 
ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } -- cgit v1.2.3-70-g09d2 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 183 ++++++++--------------------------------------- include/linux/ftrace.h | 18 +++-- kernel/trace/ftrace.c | 154 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 192 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2e060c58b86..b69795efa22 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,25 +23,6 @@ /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; -struct ftrace_record { - struct dyn_ftrace rec; - int failed; -} __attribute__((packed)); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct ftrace_record records[]; -} __attribute__((packed)); - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -#define MCOUNT_ADDR ((long)(&mcount)) - union ftrace_code_union { char code[5]; struct { @@ -50,33 +31,41 @@ union ftrace_code_union { } __attribute__((packed)); }; -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +notrace int ftrace_ip_converted(unsigned long ip) { - 
struct ftrace_record *rec; unsigned long save; ip -= CALL_BACK; save = *(long *)ip; - /* If this was already converted, skip it */ - if (save == *ftrace_nop) - return NULL; + return save == *ftrace_nop; +} - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} - rec = &ftrace_pages->records[ftrace_pages->index++]; +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; - return &rec->rec; + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; } -static int notrace +notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -static int notrace ftrace_calc_offset(long ip) -{ - return (int)(MCOUNT_ADDR - ip); -} - -notrace void ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - union ftrace_code_union save; - struct ftrace_record *r = - container_of(rec, struct ftrace_record, rec); - - ip = rec->ip; - - save.e8 = 0xe8; - save.offset = ftrace_calc_offset(ip); - - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - - r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); -} - -static void notrace 
ftrace_replace_code(int saved) -{ - unsigned char *new = NULL, *old = NULL; - struct ftrace_record *rec; - struct ftrace_page *pg; - unsigned long ip; - int i; - - if (saved) - old = (char *)ftrace_nop; - else - new = (char *)ftrace_nop; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - union ftrace_code_union calc; - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->failed) - continue; - - ip = rec->rec.ip; - - calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip); - - if (saved) - new = calc.code; - else - old = calc.code; - - ip -= CALL_BACK; - - rec->failed = ftrace_modify_code(ip, old, new); - } - } - -} - -notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - -notrace void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -notrace int __init ftrace_shutdown_arch_init(void) +int __init ftrace_dyn_arch_init(void) { const unsigned char *const *noptable = find_nop_table(); - struct ftrace_page *pg; - int cnt; - int i; ftrace_nop = (unsigned long *)noptable[CALL_BACK]; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. 
- */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - return 0; } + diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ccd8537dbdb..d509ad6c9cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,19 +42,23 @@ extern void mcount(void); # define FTRACE_HASHBITS 10 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static 
notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. 
+ */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v1.2.3-70-g09d2 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftrace_stub function. When the dynamic ftrace code is initialized, the call to ftrace_stub is replaced with a call to ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When a ftrace is enabled, the original calls to mcount will now be set to call ftrace_caller, which will do a direct call to the registered ftrace function. This direct call is also patched when the function that should be called is updated. All patching is performed by a kstop_machine routine to prevent any type of race condition associated with modifying code on the fly. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 47 +++++++++++- arch/x86/kernel/entry_64.S | 67 ++++++++++++++++- arch/x86/kernel/ftrace.c | 41 +++++++++- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 183 ++++++++++++++++++++++++++------------------- 5 files changed, 261 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f47b9b5440d..e6517ce0b82 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ + ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace - .globl ftrace_stub ftrace_stub: ret @@ -1126,7 +1166,7 @@ trace: movl 0xc(%esp), %eax movl 0x4(%ebp), %edx - call *ftrace_trace_function + call *ftrace_trace_function popl %edx popl %ecx @@ -1134,7 +1174,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f046e0c6488..fe25e5febca 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,70 @@ .code64 #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -89,7 +153,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b69795efa22..9f44623e007 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -int __init ftrace_dyn_arch_init(void) +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtact this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. 
+ */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) { const unsigned char *const *noptable = find_nop_table(); + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; return 0; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d509ad6c9cb..b0dd0093058 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,9 +56,14 @@ struct dyn_ftrace { extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); -extern int ftrace_dyn_arch_init(void); +extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_mcount_set(unsigned long *data); extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code); +extern int ftrace_update_ftrace_func(ftrace_func_t func); +extern void ftrace_caller(void); +extern void ftrace_call(void); +extern void mcount_call(void); #endif #ifdef CONFIG_FRAME_POINTER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3bf66..88544f9bc0e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + 
FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. 
*/ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - 
stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: 
mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static 
int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v1.2.3-70-g09d2 From 5072c59fd45e9976d02ee6f18c7336ef97623cbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add filter select functions to trace This patch adds two files to the debugfs system: /debugfs/tracing/available_filter_functions and /debugfs/tracing/set_ftrace_filter The available_filter_functions lists all functions that has been recorded by the ftraced that has called the ftrace_record_ip function. This is to allow users to see what functions have been converted to nops and can be enabled for tracing. 
To enable functions, simply echo the names (whitespace delimited) into set_ftrace_filter. Simple wildcards are also allowed. echo 'scheduler' > /debugfs/tracing/set_ftrace_filter Will have only the scheduler be activated when tracing is enabled. echo 'sched_*' > /debugfs/tracing/set_ftrace_filter Will have only the functions starting with 'sched_' be activated. echo '*lock' > /debugfs/tracing/set_ftrace_filter Will have only functions ending with 'lock' be activated. echo '*lock*' > /debugfs/tracing/set_ftrace_filter Will have only functions with 'lock' in their names be activated. Note: 'sched*lock' will not work. The only wildcard allowed is an asterisk at the beginning and/or end of the string passed in. Multiple whitespace-delimited names can be passed in: echo 'scheduler *lock *acpi*' > /debugfs/tracing/set_ftrace_filter is the same as: echo 'scheduler' > /debugfs/tracing/set_ftrace_filter echo '*lock' >> /debugfs/tracing/set_ftrace_filter echo '*acpi*' >> /debugfs/tracing/set_ftrace_filter Appending does just that. It appends to the list. 
To disable all filters simply echo an empty line in: echo > /debugfs/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 4 +- kernel/trace/ftrace.c | 527 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 513 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b0dd0093058..f5911d2d42c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,7 +43,9 @@ extern void mcount(void); # define FTRACE_HASHSIZE (1< #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include "trace.h" @@ -151,12 +154,15 @@ enum { FTRACE_DISABLE_MCOUNT = (1 << 4), }; +static int ftrace_filtered; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; @@ -282,16 +288,82 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) -static void notrace ftrace_replace_code(int saved) +static void notrace +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + unsigned long fl; + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. 
+ */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! + */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) + new = ftrace_call_replace(ip, FTRACE_ADDR); + else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + +static void notrace ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long ip; - int failed; int i; - if (saved) + if (enable) old = ftrace_nop_replace(); else new = ftrace_nop_replace(); @@ -304,16 +376,7 @@ static void notrace ftrace_replace_code(int saved) if (rec->flags & FTRACE_FL_FAILED) continue; - ip = rec->ip; - - if (saved) - new = ftrace_call_replace(ip, FTRACE_ADDR); - else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + __ftrace_replace_code(rec, old, new, enable); } } } @@ -580,6 +643,436 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static 
void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int notrace +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static void notrace ftrace_filter_reset(void) 
+{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static int notrace +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret = 0; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_filter_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = FTRACE_ITER_FILTER; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static ssize_t notrace +ftrace_filter_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t notrace +ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void notrace +ftrace_match(unsigned char *buff, int len) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + 
search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t notrace +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_filter_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret 
= get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static int notrace +ftrace_filter_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_filter_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_filter_lock); + return 0; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_filter_read, + .write = ftrace_filter_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_filter_release, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if 
(!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; @@ -657,14 +1150,14 @@ int unregister_ftrace_function(struct ftrace_ops *ops) notrace int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&ftrace_sysctl_lock); - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) goto out; -- cgit v1.2.3-70-g09d2 From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: fix kexec disable the tracer while kexec pulls the rug from under the old kernel. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/machine_kexec_32.c | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 4 ++++ include/linux/ftrace.h | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c9fc3..88923fd7a6f 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03db451..1558fdc174f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + 
#include #include #include @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f5911d2d42c..a42390c1d6e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -68,6 +68,13 @@ extern void ftrace_call(void); extern void mcount_call(void); #endif +static inline void tracer_disable(void) +{ +#ifdef CONFIG_FTRACE + ftrace_enabled = 0; +#endif +} + #ifdef CONFIG_FRAME_POINTER /* TODO: need to fix this for ARM */ # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -- cgit v1.2.3-70-g09d2 From e1c08bdd9fa73e44096e5a82c0d5928b04ab02c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: force recording Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a42390c1d6e..2c1670c6523 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -54,6 +54,8 @@ struct dyn_ftrace { unsigned long flags; }; +int ftrace_force_update(void); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -66,6 +68,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#else +# define ftrace_force_update() do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97d5cb7b7e7..4facf5ceeb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -146,6 +146,10 @@ static int notrace 
__unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +static struct task_struct *ftraced_task; +static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); +static unsigned long ftraced_iteration_counter; + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -590,9 +594,12 @@ static int notrace ftraced(void *ignore) ftraced_trigger = 0; ftrace_record_suspend--; } + ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); + wake_up_interruptible(&ftraced_waiters); + ftrace_shutdown_replenish(); set_current_state(TASK_INTERRUPTIBLE); @@ -1050,6 +1057,49 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; +/** + * ftrace_force_update - force an update to all recording ftrace functions + * + * The ftrace dynamic update daemon only wakes up once a second. + * There may be cases where an update needs to be done immediately + * for tests or internal kernel tracing to begin. This function + * wakes the daemon to do an update and will not return until the + * update is complete. 
+ */ +int ftrace_force_update(void) +{ + unsigned long last_counter; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + if (!ftraced_task) + return -ENODEV; + + mutex_lock(&ftraced_lock); + last_counter = ftraced_iteration_counter; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ftraced_waiters, &wait); + + do { + mutex_unlock(&ftraced_lock); + wake_up_process(ftraced_task); + schedule(); + mutex_lock(&ftraced_lock); + if (signal_pending(current)) { + ret = -EINTR; + break; + } + set_current_state(TASK_INTERRUPTIBLE); + } while (last_counter == ftraced_iteration_counter); + + mutex_unlock(&ftraced_lock); + remove_wait_queue(&ftraced_waiters, &wait); + set_current_state(TASK_RUNNING); + + return ret; +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1095,6 +1145,7 @@ static int __init notrace ftrace_dynamic_init(void) return -1; last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; return 0; } -- cgit v1.2.3-70-g09d2 From c7aafc549766b87819285d3480648fc652a47bc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: cleanups factor out code and clean it up. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 8 +-- kernel/trace/trace.c | 144 ++++++++++++++++++++++++-------------- kernel/trace/trace.h | 8 ++- kernel/trace/trace_irqsoff.c | 32 ++++----- kernel/trace/trace_sched_wakeup.c | 18 ++--- kernel/trace/trace_selftest.c | 25 ++++--- 7 files changed, 134 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2c1670c6523..953a36d6a19 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -69,7 +69,7 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() do { } while (0) +# define ftrace_force_update() ({ 0; }) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4facf5ceeb8..6d4d2e86deb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1152,10 +1152,10 @@ static int __init notrace ftrace_dynamic_init(void) core_initcall(ftrace_dynamic_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6d026f17db..61d2f022886 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -142,12 +142,59 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +void check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + BUG_ON(data->trace_pages.next->prev != &data->trace_pages); + BUG_ON(data->trace_pages.prev->next 
!= &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + BUG_ON(page->lru.next->prev != &page->lru); + BUG_ON(page->lru.prev->next != &page->lru); + } +} + +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + check_pages(data); + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +notrace static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + tr1->trace_current = NULL; + memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_current_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + notrace void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -155,11 +202,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; - save_trace = max_tr.data[i]->trace; - save_pages = max_tr.data[i]->trace_pages; - memcpy(max_tr.data[i], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[i], data); tracing_reset(data); } @@ -177,8 +220,6 @@ notrace void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ 
-186,11 +227,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); - save_trace = max_tr.data[cpu]->trace; - save_pages = max_tr.data[cpu]->trace_pages; - memcpy(max_tr.data[cpu], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -234,9 +272,9 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_possible_cpu(i) { - if (!data->trace) - continue; data = tr->data[i]; + if (!head_page(data)) + continue; tracing_reset(data); } current_trace = type; @@ -298,7 +336,7 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = data->trace; + data->trace_current = head_page(data); data->trace_current_idx = 0; } @@ -425,26 +463,31 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) } static inline notrace struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, - struct trace_array_cpu *data) +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct page *page; struct list_head *next; + struct page *page; data->trace_idx++; idx = data->trace_current_idx; idx_next = idx + 1; + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + entry = data->trace_current + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { page = virt_to_page(data->trace_current); - if (unlikely(&page->lru == data->trace_pages.prev)) - next = data->trace_pages.next; - else - next = page->lru.next; + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = page->lru.next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + page = list_entry(next, struct page, 
lru); data->trace_current = page_address(page); idx_next = 0; @@ -456,18 +499,17 @@ tracing_get_trace_entry(struct trace_array *tr, } static inline notrace void -tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags) +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; unsigned long pc; pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); - entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | @@ -476,16 +518,15 @@ tracing_generic_entry_update(struct trace_entry *entry, notrace void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, - unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; } notrace void @@ -496,7 +537,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; entry->ctx.prev_pid = prev->pid; @@ -540,6 +581,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, } page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + array = 
page_address(page); return &array[iter->next_page_idx[cpu]]; @@ -554,7 +597,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int cpu; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && @@ -762,7 +805,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; for_each_possible_cpu(cpu) { - if (tr->data[cpu]->trace) { + if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; @@ -975,8 +1018,7 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (data->trace && - data->trace_idx) + if (head_page(data) && data->trace_idx) return 0; } return 1; @@ -1576,9 +1618,9 @@ static struct tracer no_tracer __read_mostly = static int trace_alloc_page(void) { struct trace_array_cpu *data; - void *array; struct page *page, *tmp; LIST_HEAD(pages); + void *array; int i; /* first allocate a page for each CPU */ @@ -1610,14 +1652,14 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); ClearPageLRU(page); #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); SetPageLRU(page); #endif @@ -1628,7 +1670,7 @@ static int trace_alloc_page(void) free_pages: list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } return -ENOMEM; @@ -1654,7 +1696,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - data->trace = array; /* set the array to the list */ 
INIT_LIST_HEAD(&data->trace_pages); @@ -1671,7 +1712,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - max_tr.data[i]->trace = array; INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); page = virt_to_page(array); @@ -1716,24 +1756,22 @@ __init static int tracer_alloc_buffers(void) struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; - if (data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - if (data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #endif } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 88edbf1f678..cc1d34b8b77 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,12 +53,12 @@ struct trace_entry { * the trace, etc.) 
*/ struct trace_array_cpu { - void *trace; void *trace_current; - unsigned trace_current_idx; struct list_head trace_pages; - unsigned long trace_idx; atomic_t disabled; + /* these fields get copied into max-trace: */ + unsigned trace_current_idx; + unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -216,4 +216,6 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ +extern void *head_page(struct trace_array_cpu *data); + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 14183b8f79c..2dfebb67fdf 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -144,7 +144,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock(&max_trace_lock); + spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -165,32 +165,24 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us critical section violates %lu us threshold.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); - else + latency, nsecs_to_usecs(tracing_thresh)); + } else { printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency " - "critical section.\n => started at timestamp %lu: ", + " new %lu us maximum-latency critical section.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, t0); - - print_symbol(KERN_CONT "<%s>\n", data->critical_start); - printk(KERN_CONT " => ended at timestamp %lu: ", t1); - print_symbol(KERN_CONT "<%s>\n", data->critical_end); - 
dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + latency); + } max_sequence++; out_unlock: - spin_unlock(&max_trace_lock); + spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -216,7 +208,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || atomic_read(&data->disabled)) return; @@ -256,7 +248,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || !data->critical_start || atomic_read(&data->disabled)) return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3d10ff01f80..688df965f3f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -107,24 +107,18 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us wakeup latency violates %lu us threshold.\n", wakeup_task->comm, wakeup_task->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); + latency, nsecs_to_usecs(tracing_thresh)); } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " - "wakeup latency.\n => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum wakeup latency.\n", wakeup_task->comm, wakeup_task->pid, - cpu, latency, t0); + cpu, latency); } - printk(KERN_CONT " ended at timestamp %lu: ", t1); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT 
" dump-end timestamp %lu\n\n", t1); - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ef4d3cc009f..c01874c3b1f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,6 +1,7 @@ /* Include in trace.c */ #include +#include static inline int trace_valid_entry(struct trace_entry *entry) { @@ -15,28 +16,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) static int trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) { - struct page *page; struct trace_entry *entries; + struct page *page; int idx = 0; int i; + BUG_ON(list_empty(&data->trace_pages)); page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); - if (data->trace != entries) + if (head_page(data) != entries) goto failed; /* * The starting trace buffer always has valid elements, - * if any element exits. + * if any element exists. */ - entries = data->trace; + entries = head_page(data); for (i = 0; i < tr->entries; i++) { - if (i < data->trace_idx && - !trace_valid_entry(&entries[idx])) { - printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. 
invalid entry %d ", + entries[idx].type); goto failed; } @@ -80,11 +82,10 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int ret = 0; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; cnt += tr->data[cpu]->trace_idx; - printk("%d: count = %ld\n", cpu, cnt); ret = trace_test_buffer_cpu(tr, tr->data[cpu]); if (ret) @@ -117,6 +118,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ + ftrace_enabled = 1; + tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -124,6 +127,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tr->ctrl = 0; trace->ctrl_update(tr); + ftrace_enabled = 0; + /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); @@ -328,7 +333,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* create a high prio thread */ p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (!IS_ERR(p)) { + if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } -- cgit v1.2.3-70-g09d2 From 77a2b37d227483fe52aead242652aee406c25bf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: startup tester on dynamic tracing. This patch adds a startup self test on dynamic code modification and filters. The test filters on a specific function, makes sure that no other function is traced, executes the function, then makes sure that the function is traced. This patch also fixes a slight bug with the ftrace selftest, where tracer_enabled was not being set. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 + kernel/trace/ftrace.c | 19 +++++++ kernel/trace/trace_selftest.c | 113 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 953a36d6a19..a842d96c634 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -55,6 +55,7 @@ struct dyn_ftrace { }; int ftrace_force_update(void); +void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); @@ -70,6 +71,7 @@ extern void ftrace_call(void); extern void mcount_call(void); #else # define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d4d2e86deb..5e9389faaf7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1010,6 +1010,25 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, return ret; } +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
+ */ +notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + mutex_lock(&ftrace_filter_lock); + if (reset) + ftrace_filter_reset(); + if (buf) + ftrace_match(buf, len); + mutex_unlock(&ftrace_filter_lock); +} + static int notrace ftrace_filter_release(struct inode *inode, struct file *file) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c01874c3b1f..4c8a1b2d823 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,6 +99,100 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) } #ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +#define __STR(x) #x +#define STR(x) __STR(x) +static int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* filter only on our function */ + ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), + sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. 
"); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed .."); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -109,8 +203,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { unsigned long count; int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; - /* make sure functions have been recorded */ + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ ret = ftrace_force_update(); if (ret) { printk(KERN_CONT ".. ftraced failed .. "); @@ -119,6 +218,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; + tracer_enabled = 1; tr->ctrl = 1; trace->init(tr); @@ -136,8 +236,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) if (!ret && !count) { printk(KERN_CONT ".. 
no entries found .."); ret = -1; + goto out; } + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + return ret; } #endif /* CONFIG_FTRACE */ @@ -415,6 +523,3 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_DYNAMIC_FTRACE -#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3-70-g09d2 From 37ad508419f0fdfda7b378756eb1f35cfd26d96d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace - fix dynamic ftrace memory leak The ftrace dynamic function update allocates a record to store the instruction pointers that are being modified. If the modified instruction pointer fails to update, then the record is marked as failed and nothing more is done. Worse, if the modification fails, but the record ip function is still called, it will allocate a new record and try again. In just a matter of time, this will cause a serious memory leak and crash the system. This patch plugs this memory leak. When a record fails, it is included back into the pool of records to be used. Now a record may fail over and over again, but the number of allocated records will not increase. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 7 ++++--- kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a842d96c634..61e757bd235 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,9 +43,10 @@ extern void mcount(void); # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + /* todo, disable tracing altogether on this warning */ + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -356,8 +385,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } } static void notrace ftrace_replace_code(int enable) @@ -407,8 +444,10 @@ ftrace_code_disable(struct dyn_ftrace *rec) call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); - if (failed) + if (failed) { rec->flags |= 
FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } } static int notrace __ftrace_modify_code(void *data) -- cgit v1.2.3-70-g09d2 From 4eebcc81a33fbc45e28542b50197ed7b3c486d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: disable tracing on failure Since ftrace touches practically every function, if we detect any anomaly, we want to fully disable ftrace. This patch adds code to try to shut down ftrace as much as possible without doing any more harm if something is detected not quite correct. This only kills ftrace; this patch does not have checks for other parts of the tracer (irqsoff, wakeup, etc.). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 3 ++ kernel/trace/ftrace.c | 112 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_selftest.c | 4 ++ 3 files changed, 110 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 61e757bd235..4650a3160b7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,6 +58,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e02aa690b2..ff42345dd78 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,9 +29,16 @@ #include "trace.h" -int ftrace_enabled; +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. 
+ */ +static int ftrace_disabled __read_mostly; + static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -230,10 +237,11 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) if (ftrace_free_records) { rec = ftrace_free_records; - /* todo, disable tracing altogether on this warning */ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { WARN_ON_ONCE(1); ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; return NULL; } @@ -260,7 +268,7 @@ ftrace_record_ip(unsigned long ip) int resched; int atomic; - if (!ftrace_enabled) + if (!ftrace_enabled || ftrace_disabled) return; resched = need_resched(); @@ -485,6 +493,9 @@ static void notrace ftrace_startup(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend++; if (ftraced_suspend == 1) @@ -507,6 +518,9 @@ static void notrace ftrace_shutdown(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend--; if (!ftraced_suspend) @@ -529,6 +543,9 @@ static void notrace ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* Force update next time */ saved_ftrace_func = NULL; @@ -544,6 +561,9 @@ static void notrace ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) @@ -600,6 +620,9 @@ static int notrace __ftrace_update_code(void *ignore) static void notrace ftrace_update_code(void) { + if (unlikely(ftrace_disabled)) + return; + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } @@ -614,6 +637,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + if (unlikely(ftrace_disabled)) + continue; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); if 
(ftrace_enabled && ftraced_trigger && !ftraced_suspend) { @@ -628,6 +654,7 @@ static int notrace ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; WARN_ON_ONCE(1); } ftraced_trigger = 0; @@ -785,6 +812,9 @@ ftrace_avail_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -843,6 +873,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret = 0; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -1063,6 +1096,9 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) { + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftrace_filter_lock); if (reset) ftrace_filter_reset(); @@ -1133,7 +1169,7 @@ int ftrace_force_update(void) DECLARE_WAITQUEUE(wait, current); int ret = 0; - if (!ftraced_task) + if (unlikely(ftrace_disabled)) return -ENODEV; mutex_lock(&ftraced_lock); @@ -1142,6 +1178,11 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ftraced_waiters, &wait); + if (unlikely(!ftraced_task)) { + ret = -ENODEV; + goto out; + } + do { mutex_unlock(&ftraced_lock); wake_up_process(ftraced_task); @@ -1154,6 +1195,7 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); } while (last_counter == ftraced_iteration_counter); + out: mutex_unlock(&ftraced_lock); remove_wait_queue(&ftraced_waiters, &wait); set_current_state(TASK_RUNNING); @@ -1161,6 +1203,22 @@ int ftrace_force_update(void) return ret; } +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = 
ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1194,21 +1252,29 @@ static int __init notrace ftrace_dynamic_init(void) stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) - return addr; + if (addr) { + ret = (int)addr; + goto failed; + } ret = ftrace_dyn_table_alloc(); if (ret) - return ret; + goto failed; p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) - return -1; + if (IS_ERR(p)) { + ret = -1; + goto failed; + } last_ftrace_enabled = ftrace_enabled = 1; ftraced_task = p; return 0; + + failed: + ftrace_disabled = 1; + return ret; } core_initcall(ftrace_dynamic_init); @@ -1217,8 +1283,30 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. + */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. 
@@ -1234,6 +1322,9 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; + if (unlikely(ftrace_disabled)) + return -1; + mutex_lock(&ftrace_sysctl_lock); ret = __register_ftrace_function(ops); ftrace_startup(); @@ -1267,6 +1358,9 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, { int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a6f1ed75f83..85715b86a34 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + return ret; } #endif /* CONFIG_FTRACE */ -- cgit v1.2.3-70-g09d2 From aeaee8a2c9cb4489f166ca0e39c568e8254faaa6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4650a3160b7..08fbef1744c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); -/* totally disable ftrace - can not re-enable after this */ -void ftrace_kill(void); - /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -74,10 +71,13 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() ({ 0; }) -# define ftrace_set_filter(buf, len, reset) do { 
} while (0) +# define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); + static inline void tracer_disable(void) { #ifdef CONFIG_FTRACE -- cgit v1.2.3-70-g09d2 From 86387f7ee5d3273ff4859e2c64ce656639b6ca65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: add stack tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 + kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 11 ++++++ 4 files changed, 99 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 08fbef1744c..0d3714e7110 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -93,6 +93,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) #else # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) # define CALLER_ADDR1 0UL @@ -100,6 +101,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 0UL # define CALLER_ADDR4 0UL # define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL #endif #ifdef CONFIG_IRQSOFF_TRACER diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3f73a171024..eb1988ed84b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,7 @@ config TRACER_MAX_TRACE config TRACING bool select DEBUG_FS + select STACKTRACE config FTRACE bool "Kernel Function Tracer" diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 192c1354a7e..b4b1b4fe99f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -28,6 
+28,8 @@ #include #include +#include + #include "trace.h" unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; @@ -88,6 +90,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_STACK, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -109,6 +112,7 @@ enum trace_iterator_flags { TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, }; #define TRACE_ITER_SYM_MASK \ @@ -124,10 +128,11 @@ static const char *trace_options[] = { "hex", "bin", "block", + "stacktrace", NULL }; -static unsigned trace_flags; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; static DEFINE_SPINLOCK(ftrace_max_lock); @@ -657,7 +662,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void @@ -685,13 +690,39 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); +} + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); } void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *prev, struct task_struct *next, + struct task_struct *prev, + struct task_struct *next, unsigned long flags) { struct trace_entry *entry; @@ -706,16 +737,18 @@ 
tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *wakee, struct task_struct *curr, + struct task_struct *wakee, + struct task_struct *curr, unsigned long flags) { struct trace_entry *entry; @@ -730,6 +763,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; + __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) @@ -1179,6 +1213,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long rel_usecs; char *comm; int S; + int i; if (!next_entry) next_entry = entry; @@ -1197,8 +1232,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + if (entry->type != TRACE_STACK) { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } } switch (entry->type) { case TRACE_FN: @@ -1226,6 +1263,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->special.arg2, entry->special.arg3); break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1241,8 +1286,9 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long long t; 
unsigned long secs; char *comm; - int S; int ret; + int S; + int i; entry = iter->ent; @@ -1252,15 +1298,17 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; + if (entry->type != TRACE_STACK) { + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + } switch (entry->type) { case TRACE_FN: @@ -1303,6 +1351,22 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; } return 1; } @@ -1344,6 +1408,7 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: + case TRACE_STACK: ret = trace_seq_printf(s, " %lx %lx %lx\n", entry->special.arg1, entry->special.arg2, @@ -1399,6 +1464,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); @@ -1433,6 +1499,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_FIELD_RET(s, entry->special.arg1); SEQ_PUT_FIELD_RET(s, entry->special.arg2); 
SEQ_PUT_FIELD_RET(s, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90e0ba0f6eb..387bdcf45e2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,6 +34,16 @@ struct special_entry { unsigned long arg3; }; +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 5 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -51,6 +61,7 @@ struct trace_entry { struct ftrace_entry fn; struct ctx_switch_entry ctx; struct special_entry special; + struct stack_entry stack; }; }; -- cgit v1.2.3-70-g09d2 From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: sched tracer fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 ------ kernel/sched.c | 2 +- kernel/trace/trace_sched_wakeup.c | 13 +++---------- 3 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e26f1fdbfe..05744f9cb09 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2130,17 +2130,11 @@ ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) #ifdef CONFIG_SCHED_TRACER extern void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -extern void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr); #else static inline void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { } -static inline void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched.c b/kernel/sched.c index 328494e28df..53ab1174664 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,7 
+2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_new_task(p, rq->curr); + ftrace_wake_up_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 87fa7b253b5..2a012423f9d 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -201,20 +201,13 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; - wakeup_check_start(wakeup_trace, wakee, curr); -} - -void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ - if (likely(!tracer_enabled)) - return; + tracing_record_cmdline(curr); + tracing_record_cmdline(wakee); wakeup_check_start(wakeup_trace, wakee, curr); } -- cgit v1.2.3-70-g09d2 From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: sched tracer, trace full rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 32 ++++++++++++++++------- kernel/sched.c | 35 ++++++++++++++++++++++--- kernel/trace/trace.c | 55 ++++++++++++++++----------------------- kernel/trace/trace.h | 14 ++++++++++ kernel/trace/trace_sched_switch.c | 24 +++++++++++------ 5 files changed, 108 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 05744f9cb09..652d380ae56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,20 +2119,34 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void -ftrace_ctx_switch(struct task_struct *prev, struct 
task_struct *next); +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); +extern void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr); +extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void -ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) +{ +} +static inline void +sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) +{ +} +static inline void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr) +{ +} +static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -#endif - -#ifdef CONFIG_SCHED_TRACER -extern void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -#else static inline void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #endif diff --git a/kernel/sched.c b/kernel/sched.c index 53ab1174664..b9208a0e33a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + +void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) +{ + struct sched_entity *se; + struct task_struct *p; + struct rb_node *curr; + struct rq *rq = __rq; + + curr = first_fair(&rq->cfs); + if (!curr) + return; + + while (curr) { + se = rb_entry(curr, struct sched_entity, run_node); + if (!entity_is_task(se)) + continue; + + p = task_of(se); + + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + + curr = rb_next(curr); + } +} + +#endif + /*** * try_to_wake_up - wake up a 
thread * @p: the to-be-woken-up thread @@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(prev, next); + ftrace_ctx_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e4b7119e26..65173b14b91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -66,7 +66,18 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); -static DECLARE_WAIT_QUEUE_HEAD (trace_wait); +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/* + * FIXME: where should this be called? 
+ */ +void trace_wake_up(void) +{ + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -103,18 +114,6 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, -}; - #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) @@ -132,8 +131,6 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; - static DEFINE_SPINLOCK(ftrace_max_lock); /* @@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + void -trace_special(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; struct trace_entry *entry; unsigned long irq_flags; @@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -739,9 +736,6 @@ 
tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } #ifdef CONFIG_FTRACE @@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %lx %lx %lx\n", + trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 387bdcf45e2..75e23747567 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -274,4 +274,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); +extern unsigned long trace_flags; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, +}; + #endif /* 
_LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8b1cf1a3aee..12658b3f2b2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,7 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static void -ctx_switch_func(struct task_struct *prev, struct task_struct *next) +ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +static void +wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +void +ftrace_ctx_switch(void *__rq, struct task_struct *prev, + struct task_struct *next) { tracing_record_cmdline(prev); @@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
*/ - ctx_switch_func(prev, next); + ctx_switch_func(__rq, prev, next); /* * Chain to the wakeup tracer (this is a NOP if disabled): @@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +ftrace_wake_up_task(void *__rq, struct task_struct *wakee, + struct task_struct *curr) { tracing_record_cmdline(curr); - wakeup_func(wakee, curr); + wakeup_func(__rq, wakee, curr); /* * Chain to the wakeup tracer (this is a NOP if disabled): -- cgit v1.2.3-70-g09d2 From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix wakeups Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 15 +++++++++++---- 3 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 652d380ae56..a3970b56375 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern int runqueue_is_locked(void); + extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); diff --git a/kernel/sched.c b/kernel/sched.c index 673b588b713..9ca4a2e6a23 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. 
+ */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65173b14b91..2ca9d66aa74 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * FIXME: where should this be called? - */ void trace_wake_up(void) { - if (!(trace_flags & TRACE_ITER_BLOCK)) + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) wake_up(&trace_wait); } @@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } void @@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #endif @@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #ifdef CONFIG_FTRACE -- cgit v1.2.3-70-g09d2 From 1a3c3034336320554a3342572dae98d69e054fc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 20 ++++++++++++-------- kernel/trace/trace.c | 4 ---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a3970b56375..5b186bed54b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,6 
+2119,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +#ifdef CONFIG_TRACING +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} +#endif + #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); @@ -2126,9 +2138,6 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2146,11 +2155,6 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -static inline void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2ca9d66aa74..65d2c0a61ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -670,8 +670,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) @@ -693,8 +691,6 @@ __trace_special(void *__tr, void *__data, trace_wake_up(); } -#endif - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, -- cgit v1.2.3-70-g09d2 From 88a4216c3ec4281fc7e6725cc3a3ccd01fb1aa14 Mon Sep 17 00:00:00 2001 From: 
Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched special Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 ++++++ kernel/sched_fair.c | 3 +++ kernel/trace/trace.c | 6 +++--- kernel/trace/trace_sched_switch.c | 24 ++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b186bed54b..360ca99033d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2138,6 +2138,8 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2155,6 +2157,10 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b..dc1856f1079 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,6 +1061,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; + ftrace_special(__LINE__, curr->se.avg_overlap, sync); + ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1238,6 +1240,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + ftrace_special(__LINE__, p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git 
a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a4032492fc..b87a2641489 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1251,7 +1251,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %ld %ld %ld\n", + trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1335,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1400,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5a217e86358..bddf676914e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -103,6 +103,30 @@ ftrace_wake_up_task(void *__rq, struct task_struct *wakee, wakeup_sched_wakeup(wakee, curr); } +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v1.2.3-70-g09d2 From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 
2008 21:21:04 +0200 Subject: ftrace: limit trace entries Currently there is no protection from the root user to use up all of memory for trace buffers. If the root user allocates too many entries, the OOM killer might start killing off all tasks. This patch adds an algorithm to check the following condition: pages_requested > (freeable_memory + current_trace_buffer_pages) / 4 If the above is met then the allocation fails. The above prevents more than 1/4th of freeable memory from being used by trace buffers. To determine the freeable_memory, I made determine_dirtyable_memory in mm/page-writeback.c global. Special thanks go to Peter Zijlstra for suggesting the above calculation. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/writeback.h | 2 ++ kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 10 +++++++--- 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f462439cc28..bd91987c065 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; +extern unsigned long determine_dirtyable_memory(void); + extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82ced406aac..2824cf48cdc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ static int trace_free_page(void); static int tracing_disabled = 1; +static unsigned long tracing_pages_allocated; + long ns2usecs(cycle_t nsec) { @@ -2591,12 +2594,41 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } if (val > global_trace.entries) { + long pages_requested; + unsigned long
freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + while (global_trace.entries < val) { if (trace_alloc_page()) { cnt = -ENOMEM; goto out; } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; } + } else { /* include the number of entries in val (inc of page entries) */ while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) @@ -2776,6 +2808,7 @@ static int trace_alloc_page(void) struct page *page, *tmp; LIST_HEAD(pages); void *array; + unsigned pages_allocated = 0; int i; /* first allocate a page for each CPU */ @@ -2787,6 +2820,7 @@ static int trace_alloc_page(void) goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); @@ -2798,6 +2832,7 @@ static int trace_alloc_page(void) "for trace buffer!\n"); goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); #endif @@ -2819,6 +2854,7 @@ static int trace_alloc_page(void) SetPageLRU(page); #endif } + tracing_pages_allocated += pages_allocated; global_trace.entries += ENTRIES_PER_PAGE; return 0; @@ -2853,6 +2889,8 @@ static int trace_free_page(void) page = list_entry(p, struct page, lru); ClearPageLRU(page); list_del(&page->lru); + tracing_pages_allocated--; + tracing_pages_allocated--; __free_page(page); tracing_reset(data); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef3..b38f700825f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -126,8 
+126,6 @@ static void background_writeout(unsigned long _min_pages); static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; -- cgit v1.2.3-70-g09d2 From dc102a8fae2d0d6bf5223fc549247f2e23959ae6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:09 +0200 Subject: Markers - remove extra format argument Denys Vlasenko : > Not in this patch, but I noticed: > > #define __trace_mark(name, call_private, format, args...) \ > do { \ > static const char __mstrtab_##name[] \ > __attribute__((section("__markers_strings"))) \ > = #name "\0" format; \ > static struct marker __mark_##name \ > __attribute__((section("__markers"), aligned(8))) = \ > { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ > 0, 0, marker_probe_cb, \ > { __mark_empty_function, NULL}, NULL }; \ > __mark_check_format(format, ## args); \ > if (unlikely(__mark_##name.state)) { \ > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > } \ > } while (0) > > In this call: > > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > > you make gcc allocate duplicate format string. 
You can use > &__mstrtab_##name[sizeof(#name)] instead since it holds the same string, > or drop ", format," above and "const char *fmt" from here: > > void (*call)(const struct marker *mdata, /* Probe wrapper */ > void *call_private, const char *fmt, ...); > > since mdata->format is the same and all callees which need it can take it there. Very good point. I actually thought about dropping it, since it would remove an unnecessary argument from the stack. And actually, since I now have the marker_probe_cb sitting between the marker site and the callbacks, there is no API change required. Thanks :) Mathieu Signed-off-by: Mathieu Desnoyers CC: Denys Vlasenko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/marker.h | 11 +++++------ kernel/marker.c | 30 ++++++++++++++---------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 430f6adf976..338533abb47 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -44,8 +44,8 @@ struct marker { */ char state; /* Marker state. */ char ptype; /* probe type : 0 : single, 1 : multi */ - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; } __attribute__((aligned(8))); @@ -72,8 +72,7 @@ struct marker { __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ - (&__mark_##name, call_private, \ - format, ## args); \ + (&__mark_##name, call_private, ## args);\ } \ } while (0) @@ -117,9 +116,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...) 
extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); /* * Connect a probe to a marker. diff --git a/kernel/marker.c b/kernel/marker.c index b5a9fe1d50d..1abfb923b76 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex); struct marker_entry { struct hlist_node hlist; char *format; - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ @@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * marker_probe_cb Callback that prepares the variable argument list for probes. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Since we do not use "typical" pointer based RCU in the 1 argument case, we * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, - const char *fmt, ...) +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) { va_list args; char ptype; @@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. 
*/ smp_rmb(); - va_start(args, fmt); - func(mdata->single.probe_private, call_private, fmt, &args); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); va_end(args); } else { struct marker_probe_closure *multi; @@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) { - va_start(args, fmt); - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); va_end(args); } } @@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * marker_probe_cb Callback that does not prepare the variable argument list. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...) +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. 
*/ smp_rmb(); - func(mdata->single.probe_private, call_private, fmt, &args); + func(mdata->single.probe_private, call_private, mdata->format, + &args); } else { struct marker_probe_closure *multi; int i; @@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); } preempt_enable(); } -- cgit v1.2.3-70-g09d2 From 0aa977f592f17004f9d1d545f2e1bb9ea71896c3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Markers - define non optimized marker To support the forthcoming "immediate values" marker optimization, we must have a way to declare markers in few code paths that does not use instruction modification based enable. This will be the case of printk(), some traps and eventually lockdep instrumentation. Changelog : - Fix reversed boolean logic of "generic". Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/marker.h | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 338533abb47..1290653f924 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -58,8 +58,12 @@ struct marker { * Make sure the alignment of the structure in the __markers section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. + * + * The "generic" argument controls which marker enabling mechanism must be used. + * If generic is true, a variable read is used. + * If generic is false, immediate values are used. */ -#define __trace_mark(name, call_private, format, args...) 
\ +#define __trace_mark(generic, name, call_private, format, args...) \ do { \ static const char __mstrtab_##name[] \ __attribute__((section("__markers_strings"))) \ @@ -79,7 +83,7 @@ struct marker { extern void marker_update_probe_range(struct marker *begin, struct marker *end); #else /* !CONFIG_MARKERS */ -#define __trace_mark(name, call_private, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) @@ -87,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin, #endif /* CONFIG_MARKERS */ /** - * trace_mark - Marker + * trace_mark - Marker using code patching * @name: marker name, not quoted. * @format: format string * @args...: variable argument list * - * Places a marker. + * Places a marker using optimized code patching technique (imv_read()) + * to be enabled when immediate values are present. */ #define trace_mark(name, format, args...) \ - __trace_mark(name, NULL, format, ## args) + __trace_mark(0, name, NULL, format, ## args) + +/** + * _trace_mark - Marker using variable read + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using a standard memory read (_imv_read()) to be + * enabled. Should be used for markers in code paths where instruction + * modification based enabling is not welcome. (__init and __exit functions, + * lockdep, some traps, printk). + */ +#define _trace_mark(name, format, args...) \ + __trace_mark(1, name, NULL, format, ## args) /** * MARK_NOARGS - Format string for a marker with no argument. -- cgit v1.2.3-70-g09d2 From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Port ftrace to markers Porting ftrace to the marker infrastructure. 
Don't need to chain to the wakeup tracer from the sched tracer, because markers support multiple probes connected. Signed-off-by: Mathieu Desnoyers CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 32 ------- kernel/sched.c | 14 +++- kernel/trace/trace.h | 20 +---- kernel/trace/trace_sched_switch.c | 171 +++++++++++++++++++++++++++++++------- kernel/trace/trace_sched_wakeup.c | 106 +++++++++++++++++++++-- 5 files changed, 255 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 360ca99033d..c0b1c69b55c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2131,38 +2131,6 @@ __trace_special(void *__tr, void *__data, } #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -extern void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); -extern void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr); -extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); -#else -static inline void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) -{ -} -static inline void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr) -{ -} -static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) -{ -} -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} -#endif - extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); diff --git a/kernel/sched.c b/kernel/sched.c index ad95cca4e42..e2e985eeee7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2500,7 +2500,9 @@ out_activate: success = 1; 
out_running: - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(rq, prev, next); + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8845033ab49..f5de0601b40 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_SCHED_TRACER -extern void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); -extern void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); -#else -static inline void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) -{ -} -#endif - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, + void *__rq, struct task_struct *prev, struct task_struct *next); @@ -262,9 +247,6 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; -extern int register_tracer_switch(struct tracer_switch_ops *ops); 
-extern int unregister_tracer_switch(struct tracer_switch_ops *ops); - #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a3376478fc2..d25ffa5eaf2 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,11 +16,14 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static atomic_t sched_ref; static void -ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. 
+ */ + sched_switch_func(probe_data, __rq, prev, next); +} + static void -wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) +wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct + task_struct *curr) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) local_irq_restore(flags); } -void -ftrace_ctx_switch(void *__rq, struct task_struct *prev, - struct task_struct *next) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { - if (unlikely(atomic_read(&trace_record_cmdline_enabled))) - tracing_record_cmdline(prev); + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. 
- */ - ctx_switch_func(__rq, prev, next); + if (likely(!tracer_enabled)) + return; - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_switch(prev, next); -} + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); -void -ftrace_wake_up_task(void *__rq, struct task_struct *wakee, - struct task_struct *curr) -{ - wakeup_func(__rq, wakee, curr); + tracing_record_cmdline(task); + tracing_record_cmdline(curr); - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_wakeup(wakee, curr); + wakeup_func(probe_data, __rq, task, curr); } void @@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + 
wake_up_callback, + &ctx_trace); + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; + tracing_start_sched_switch(); } static void stop_sched_trace(struct trace_array *tr) { + tracing_stop_sched_switch(); atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } @@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5948011006b..5d2fb48e47f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "trace.h" @@ -44,11 +45,13 @@ static int report_latency(cycle_t delta) return 1; } -void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array *tr = wakeup_trace; + 
struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -113,6 +116,31 @@ out: atomic_dec(&tr->data[cpu]->disabled); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; @@ -188,19 +216,68 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + if (likely(!tracer_enabled)) return; + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); tracing_record_cmdline(curr); - tracing_record_cmdline(wakee); - wakeup_check_start(wakeup_trace, wakee, curr); + wakeup_check_start(tr, task, curr); } static void start_wakeup_tracer(struct trace_array *tr) { + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p 
rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + wakeup_reset(tr); /* @@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr) tracer_enabled = 1; return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void wakeup_tracer_init(struct trace_array *tr) -- cgit v1.2.3-70-g09d2 From 74f4e369fc5b52433ad824cef32d3bf1304549be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:15 +0200 Subject: ftrace: stacktrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 8 ++++++++ kernel/semaphore.c | 2 ++ kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h 
b/include/linux/ftrace.h index 0d3714e7110..017ab44d572 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -120,4 +120,12 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e768c..1a064adab65 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -31,6 +31,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -53,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2824cf48cdc..3271916ff03 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -901,7 +901,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 4); + __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); } @@ -927,7 +927,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 5); + __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5de0601b40..c460e85e94e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -51,7 +51,7 @@ struct special_entry { 
* Stack-trace entry: */ -#define FTRACE_STACK_ENTRIES 5 +#define FTRACE_STACK_ENTRIES 8 struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; -- cgit v1.2.3-70-g09d2 From d49dbf33f0bf8748ee3662b973eb57e60525d622 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 May 2008 10:41:53 +0200 Subject: ftrace: fix include file dependency Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 017ab44d572..911d5d80b49 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -4,6 +4,7 @@ #ifdef CONFIG_FTRACE #include +#include extern int ftrace_enabled; extern int -- cgit v1.2.3-70-g09d2 From 489f139614596cbc956a06f5e4bb41288e276fe3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Feb 2008 13:38:05 +0100 Subject: ftrace: fix build bug Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 911d5d80b49..922e23d0196 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,16 +106,16 @@ static inline void tracer_disable(void) #endif #ifdef CONFIG_IRQSOFF_TRACER - extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); - extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); + extern void time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void time_hardirqs_off(unsigned long a0, unsigned long a1); #else # define time_hardirqs_on(a0, a1) do { } while (0) # define time_hardirqs_off(a0, a1) do { } while (0) #endif #ifdef CONFIG_PREEMPT_TRACER - extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); - extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); + extern void trace_preempt_on(unsigned long a0, 
unsigned long a1); + extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else # define trace_preempt_on(a0, a1) do { } while (0) # define trace_preempt_off(a0, a1) do { } while (0) -- cgit v1.2.3-70-g09d2 From 8b7d89d02ef3c6a7c73d6596f28cea7632850af4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86: mmiotrace - trace memory mapped IO Mmiotrace is a tool for trapping memory mapped IO (MMIO) accesses within the kernel. It is used for debugging and especially for reverse engineering evil binary drivers. Mmiotrace works by wrapping the ioremap family of kernel functions and marking the returned pages as not present. Access to the IO memory triggers a page fault, which will be handled by mmiotrace's custom page fault handler. This will single-step the faulted instruction with the MMIO page marked as present. Access logs are directed to user space via relay and debug_fs. This page fault approach is necessary, because binary drivers have readl/writel etc. calls inlined and are therefore extremely difficult to trap with e.g. kprobes. This patch depends on the custom page fault handlers patch. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 27 ++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/init_task.c | 1 + arch/x86/kernel/mmiotrace/Makefile | 4 + arch/x86/kernel/mmiotrace/kmmio.c | 391 ++++++++++++++++++++++ arch/x86/kernel/mmiotrace/kmmio.h | 58 ++++ arch/x86/kernel/mmiotrace/mmio-mod.c | 527 ++++++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.c | 489 +++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.h | 39 +++ arch/x86/kernel/mmiotrace/testmmiotrace.c | 77 +++++ include/linux/mmiotrace.h | 62 ++++ 11 files changed, 1677 insertions(+) create mode 100644 arch/x86/kernel/mmiotrace/Makefile create mode 100644 arch/x86/kernel/mmiotrace/kmmio.c create mode 100644 arch/x86/kernel/mmiotrace/kmmio.h create mode 100644 arch/x86/kernel/mmiotrace/mmio-mod.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.h create mode 100644 arch/x86/kernel/mmiotrace/testmmiotrace.c create mode 100644 include/linux/mmiotrace.h (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9431a839984..7c6496e2225 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -176,6 +176,33 @@ config PAGE_FAULT_HANDLERS register a function that is called on every page fault. Custom handlers are used by some debugging and reverse engineering tools. +config MMIOTRACE + tristate "Memory mapped IO tracing" + depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS + default n + help + This will build a kernel module called mmiotrace. + + Mmiotrace traces Memory Mapped I/O access and is meant for debugging + and reverse engineering. The kernel module offers wrapped + versions of the ioremap family of functions. The driver to be traced + must be modified to call these wrappers. A user space program is + required to collect the MMIO data. 
+ + See http://nouveau.freedesktop.org/wiki/MmioTrace + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + default n + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + # # IO delay types: # diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 739d49acd2f..a51ac153685 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,6 +79,8 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace/ + obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c..027a5b6a12b 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,6 +15,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ +EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile new file mode 100644 index 00000000000..d6905f7f981 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o + +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c new file mode 100644 index 00000000000..8ba48f9c91b --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -0,0 +1,391 @@ +/* Support for MMIO probes. 
+ * Benfit many code from kprobes + * (C) 2002 Louis Zhuang . + * 2007 Alexander Eichner + * 2008 Pekka Paalanen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kmmio.h" + +#define KMMIO_HASH_BITS 6 +#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + int active; +}; + +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address); +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args); + +static DEFINE_SPINLOCK(kmmio_lock); + +/* These are protected by kmmio_lock */ +unsigned int kmmio_count; +static unsigned int handler_registered; +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct kmmio_context kmmio_ctx[NR_CPUS]; + +static struct pf_handler kmmio_pf_hook = { + .handler = kmmio_page_fault +}; + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + register_die_notifier(&nb_die); + return 0; +} + +void cleanup_kmmio(void) +{ + /* + * Assume the following have been already cleaned by calling + * unregister_kmmio_probe() appropriately: + * kmmio_page_table, kmmio_probes + */ + if (handler_registered) { + unregister_page_fault_handler(&kmmio_pf_hook); + synchronize_rcu(); + } + unregister_die_notifier(&nb_die); +} + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be 
simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/*
+ * Get the kmmio probe covering this addr (if any). A probe covers the
+ * half-open range [p->addr, p->addr + p->len).
+ * You must be holding kmmio_lock.
+ */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+	struct kmmio_probe *p;
+	list_for_each_entry(p, &kmmio_probes, list) {
+		/* p->addr + p->len is the first byte past the probe region */
+		if (addr >= p->addr && addr < (p->addr + p->len))
+			return p;
+	}
+	return NULL;
+}
+
+/*
+ * Find the fault page record for the page containing the given address.
+ * Returns NULL when no record is hashed for that page.
+ */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+{
+	struct list_head *head, *tmp;
+
+	page &= PAGE_MASK;
+	head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+	list_for_each(tmp, head) {
+		struct kmmio_fault_page *p
+			= list_entry(tmp, struct kmmio_fault_page, list);
+		if (p->page == page)
+			return p;
+	}
+
+	return NULL;
+}
+
+/*
+ * Clear _PAGE_PRESENT in the kernel mapping of the given page and flush
+ * its TLB entry. When the pmd is a large mapping the pmd itself is
+ * modified and *large (if non-NULL) is set to 1; otherwise the pte is
+ * modified.
+ */
+static void arm_kmmio_fault_page(unsigned long page, int *large)
+{
+	unsigned long address = page & PAGE_MASK;
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud = pud_offset(pgd, address);
+	pmd_t *pmd = pmd_offset(pud, address);
+	pte_t *pte = pte_offset_kernel(pmd, address);
+
+	if (pmd_large(*pmd)) {
+		set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT));
+		if (large)
+			*large = 1;
+	} else {
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+	}
+
+	__flush_tlb_one(page);
+}
+
+/*
+ * Set _PAGE_PRESENT again in the kernel mapping of the given page and
+ * flush its TLB entry. The pmd is only touched when the caller passes a
+ * non-NULL large pointing at a non-zero value; otherwise the pte is used.
+ *
+ * NOTE(review): every caller in this file passes large == NULL, so a page
+ * armed through the pmd branch would be re-enabled through the pte
+ * branch - confirm large mappings cannot reach this code.
+ */
+static void disarm_kmmio_fault_page(unsigned long page, int *large)
+{
+	unsigned long address = page & PAGE_MASK;
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud = pud_offset(pgd, address);
+	pmd_t *pmd = pmd_offset(pud, address);
+	pte_t *pte = pte_offset_kernel(pmd, address);
+
+	if (large && *large) {
+		set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT));
+		*large = 0;
+	} else {
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+	}
+
+	__flush_tlb_one(page);
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+static int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	struct kmmio_context *ctx;
+	int cpu;
+
+	/*
+	 * Preemption is now disabled to prevent process switch during
+	 * single stepping. We can only handle one active kmmio trace
+	 * per cpu, so ensure that we finish it before something else
+	 * gets to run.
+	 *
+	 * XXX what if an interrupt occurs between returning from
+	 * do_page_fault() and entering the single-step exception handler?
+	 * And that interrupt triggers a kmmio trap?
+	 */
+	preempt_disable();
+	cpu = smp_processor_id();
+	ctx = &kmmio_ctx[cpu];
+
+	/* interrupts disabled and CPU-local data => atomicity guaranteed. */
+	if (ctx->active) {
+		/*
+		 * This avoids a deadlock with kmmio_lock.
+		 * If this page fault really was due to kmmio trap,
+		 * all hell breaks loose.
+		 */
+		printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, "
+					"for address %lu. Ignoring.\n",
+					cpu, addr);
+		goto no_kmmio;
+	}
+	ctx->active++;
+
+	/*
+	 * Acquire the kmmio lock to prevent changes affecting
+	 * get_kmmio_fault_page() and get_kmmio_probe(), since we save their
+	 * returned pointers.
+	 * The lock is released in post_kmmio_handler().
+	 * XXX: could/should get_kmmio_*() be using RCU instead of spinlock?
+	 */
+	spin_lock(&kmmio_lock);
+
+	ctx->fpage = get_kmmio_fault_page(addr);
+	if (!ctx->fpage) {
+		/* this page fault is not caused by kmmio */
+		goto no_kmmio_locked;
+	}
+
+	ctx->probe = get_kmmio_probe(addr);
+	ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK));
+
+	if (ctx->probe && ctx->probe->pre_handler)
+		ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+	regs->flags |= TF_MASK;
+	regs->flags &= ~IF_MASK;
+
+	/* We hold lock, now we set present bit in PTE and single step. */
+	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+	return 1;
+
+no_kmmio_locked:
+	spin_unlock(&kmmio_lock);
+	ctx->active--;
+no_kmmio:
+	preempt_enable_no_resched();
+	/* page fault not handled by kmmio */
+	return 0;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * And we hold kmmio lock.
+ *
+ * Returns 0 when this CPU has no active kmmio trace or when TF is still
+ * set after restoring the saved flags, 1 otherwise.
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+	struct kmmio_context *ctx = &kmmio_ctx[cpu];
+
+	if (!ctx->active)
+		return 0;
+
+	if (ctx->probe && ctx->probe->post_handler)
+		ctx->probe->post_handler(ctx->probe, condition, regs);
+
+	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+	regs->flags &= ~TF_MASK;
+	regs->flags |= ctx->saved_flags;
+
+	/* These were acquired in kmmio_handler(). */
+	ctx->active--;
+	spin_unlock(&kmmio_lock);
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, flags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (regs->flags & TF_MASK)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Start tracking the page containing the given address. An already
+ * tracked page only gets its use count bumped; otherwise a new record is
+ * allocated, hashed and the page is armed.
+ * Returns 0 on success, -1 if the allocation fails.
+ * The caller in this file holds kmmio_lock.
+ */
+static int add_kmmio_fault_page(unsigned long page)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (f) {
+		f->count++;
+		return 0;
+	}
+
+	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -1;
+
+	f->count = 1;
+	f->page = page;
+	list_add(&f->list,
+		 &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]);
+
+	arm_kmmio_fault_page(f->page, NULL);
+
+	return 0;
+}
+
+/*
+ * Drop one reference on the page containing the given address. When the
+ * last reference goes away the page is disarmed and its record is
+ * unlinked and freed.
+ * The caller in this file holds kmmio_lock, and kmmio_handler() only
+ * keeps a fault page pointer while it holds that same lock, so nobody
+ * can still be using the record when it is freed here.
+ */
+static void release_kmmio_fault_page(unsigned long page)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (!f)
+		return;
+
+	f->count--;
+	if (!f->count) {
+		disarm_kmmio_fault_page(f->page, NULL);
+		list_del(&f->list);
+		/* allocated in add_kmmio_fault_page(); was leaked before */
+		kfree(f);
+	}
+}
+
+/*
+ * Register a probe and arm every page of its region.
+ * Returns 0 on success or -EEXIST when p->addr already lies inside a
+ * registered probe; in that case no state is changed.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+	int ret = 0;
+	unsigned long size = 0;
+
+	spin_lock_irq(&kmmio_lock);
+	if (get_kmmio_probe(p->addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+	/*
+	 * Count the probe only once it is known to be accepted, but still
+	 * before any page is armed, so is_kmmio_active() is already true
+	 * when the first fault can happen.
+	 */
+	kmmio_count++;
+	list_add(&p->list, &kmmio_probes);
+	/*printk("adding fault pages...\n");*/
+	while (size < p->len) {
+		if (add_kmmio_fault_page(p->addr + size))
+			printk(KERN_ERR "mmio: Unable to set page fault.\n");
+		size += PAGE_SIZE;
+	}
+
+	if (!handler_registered) {
+		register_page_fault_handler(&kmmio_pf_hook);
+		handler_registered++;
+	}
+
+out:
+	spin_unlock_irq(&kmmio_lock);
+	/*
+	 * XXX: What should I do here?
+	 * Here was a call to global_flush_tlb(), but it does not exist
+	 * anymore.
+	 */
+	return ret;
+}
+
+/*
+ * Release every page of the probe's region and remove the probe from
+ * the probe list.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long size = 0;
+
+	spin_lock_irq(&kmmio_lock);
+	while (size < p->len) {
+		release_kmmio_fault_page(p->addr + size);
+		size += PAGE_SIZE;
+	}
+	list_del(&p->list);
+	kmmio_count--;
+	spin_unlock_irq(&kmmio_lock);
+}
+
+/*
+ * According to 2.6.20, mainly x86_64 arch:
+ * This is being called from do_page_fault(), via the page fault notifier
+ * chain.
The chain is called for both user space faults and kernel space + * faults (address >= TASK_SIZE64), except not on faults serviced by + * vmalloc_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * The page fault hook functionality has put us inside RCU read lock. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (is_kmmio_active()) + if (kmmio_handler(regs, address) == 1) + return -1; + return 0; +} + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h new file mode 100644 index 00000000000..85b7f68a3b8 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_KMMIO_H +#define _LINUX_KMMIO_H + +#include +#include +#include +#include +#include +#include +#include + +struct kmmio_probe; +struct kmmio_fault_page; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + + /* start location of the probe point */ + unsigned long addr; + + /* length of the probe region */ + unsigned long len; + + /* Called before addr is executed. */ + kmmio_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... 
*/ + kmmio_post_handler_t post_handler; +}; + +struct kmmio_fault_page { + struct list_head list; + + /* location of the fault page */ + unsigned long page; + + int count; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +int init_kmmio(void); +void cleanup_kmmio(void); +int register_kmmio_probe(struct kmmio_probe *p); +void unregister_kmmio_probe(struct kmmio_probe *p); + +#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c new file mode 100644 index 00000000000..73561fe85f0 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -0,0 +1,527 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ + +#include "kmmio.h" +#include "pf_in.h" + +/* This app's relay channel files will appear in /debug/mmio-trace */ +#define APP_DIR "mmio-trace" +/* the marker injection file in /proc */ +#define MARKER_FILE "mmio-marker" + +#define MODULE_NAME "mmiotrace" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +static struct trap_reason pf_reason[NR_CPUS]; +static struct mm_io_header_rw cpu_trace[NR_CPUS]; + +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + +static const size_t subbuf_size = 256*1024; +static struct rchan *chan; +static struct dentry *dir; +static int suspended; /* XXX should this be per cpu? */ +static struct proc_dir_entry *proc_marker_file; + +/* module parameters */ +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(n_subbufs, uint, 0); +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static void record_timestamp(struct mm_io_header *header) +{ + struct timespec now; + + getnstimeofday(&now); + header->sec = now.tv_sec; + header->nsec = now.tv_nsec; +} + +/* + * Write callback for the /proc entry: + * Read a marker and write it to the mmio trace log + */ +static int write_marker(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *event = 
NULL; + struct mm_io_header *headp; + int len = (count > 65535) ? 65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + record_timestamp(headp); + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + relay_write(chan, event, sizeof(*headp) + len); + kfree(event); + return len; +} + +static void print_pte(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + if (pmd_large(*pmd)) { + printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " + "currently supported: %lx\n", + address); + BUG(); + } + printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + address, + pte_val(*pte_offset_kernel(pmd, address)), + pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. 
+ */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + "last fault for address: %lx\n", + addr, pf_reason[cpu].addr); + print_pte(addr); +#ifdef __i386__ + print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting EIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG + "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_EMERG + "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting RIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + printk(KERN_EMERG "rsi: %016lx rdi: %016lx " + "rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + + /* it doesn't make sense to have more than one active trace per cpu */ + if (pf_reason[cpu].active_traces) + die_kmmio_nesting_error(regs, addr); + else + pf_reason[cpu].active_traces++; + + pf_reason[cpu].type = type; + pf_reason[cpu].addr = addr; + pf_reason[cpu].ip = instptr; + + cpu_trace[cpu].header.type = MMIO_MAGIC; + cpu_trace[cpu].header.pid = 0; + cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); + cpu_trace[cpu].rw.address = addr; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. 
+ */ + if (trace_pc) + cpu_trace[cpu].rw.pc = instptr; + else + cpu_trace[cpu].rw.pc = 0; + + record_timestamp(&cpu_trace[cpu].header); + + switch (type) { + case REG_READ: + cpu_trace[cpu].header.type |= + (MMIO_READ << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + break; + case REG_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + cpu_trace[cpu].header.type |= + (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); + cpu_trace[cpu].rw.value = (*ip) << 16 | + *(ip + 1) << 8 | + *(ip + 2); + } + } +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + const unsigned long cpu = smp_processor_id(); + + /* this should always return the active_trace count to 0 */ + pf_reason[cpu].active_traces--; + if (pf_reason[cpu].active_traces) { + printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + BUG(); + } + + switch (pf_reason[cpu].type) { + case REG_READ: + cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, + regs); + break; + default: + break; + } + relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); +} + +/* + * subbuf_start() relay callback. + * + * Defined so that we know when events are dropped due to the buffer-full + * condition. 
+ */ +static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + if (relay_buf_full(buf)) { + if (!suspended) { + suspended = 1; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer full!!!\n", + smp_processor_id()); + } + return 0; + } else if (suspended) { + suspended = 0; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer no longer full.\n", + smp_processor_id()); + } + + return 1; +} + +/* file_create() callback. Creates relay file in debugfs. */ +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + struct dentry *buf_file; + + mmio_fops.read = relay_file_operations.read; + mmio_fops.open = relay_file_operations.open; + mmio_fops.poll = relay_file_operations.poll; + mmio_fops.mmap = relay_file_operations.mmap; + mmio_fops.release = relay_file_operations.release; + mmio_fops.splice_read = relay_file_operations.splice_read; + + buf_file = debugfs_create_file(filename, mode, parent, buf, + &mmio_fops); + + return buf_file; +} + +/* file_remove() default callback. Removes relay file in debugfs. 
*/
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct rchan_callbacks relay_callbacks = {
+	.subbuf_start = subbuf_start_handler,
+	.create_buf_file = create_buf_file_handler,
+	.remove_buf_file = remove_buf_file_handler,
+};
+
+/*
+ * create_channel - creates channel /debug/APP_DIR/cpuXXX
+ * Returns channel on success, NULL otherwise
+ */
+static struct rchan *create_channel(unsigned size, unsigned n)
+{
+	return relay_open("cpu", dir, size, n, &relay_callbacks, NULL);
+}
+
+/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */
+static void destroy_channel(void)
+{
+	if (chan) {
+		relay_close(chan);
+		chan = NULL;
+	}
+}
+
+/* One traced ioremap mapping; linked on trace_list under trace_list_lock. */
+struct remap_trace {
+	struct list_head list;
+	struct kmmio_probe probe;
+};
+static LIST_HEAD(trace_list);
+static DEFINE_SPINLOCK(trace_list_lock);
+
+/*
+ * Log an MMIO_PROBE event for a new mapping, remember the mapping on
+ * trace_list and, unless nommiotrace is set, register a kmmio probe on it.
+ * Does nothing if the trace record cannot be allocated.
+ */
+static void do_ioremap_trace_core(unsigned long offset, unsigned long size,
+							void __iomem *addr)
+{
+	struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+	struct mm_io_header_map event = {
+		.header = {
+			.type = MMIO_MAGIC |
+					(MMIO_PROBE << MMIO_OPCODE_SHIFT),
+			.sec = 0,
+			.nsec = 0,
+			.pid = 0,
+			.data_len = sizeof(struct mm_io_map)
+		},
+		.map = {
+			.phys = offset,
+			.addr = (unsigned long)addr,
+			.len = size,
+			.pc = 0
+		}
+	};
+
+	/* The record is written to below; bail out instead of oopsing. */
+	if (!trace) {
+		printk(KERN_ERR MODULE_NAME ": kmalloc failed in ioremap\n");
+		return;
+	}
+	record_timestamp(&event.header);
+
+	*trace = (struct remap_trace) {
+		.probe = {
+			.addr = (unsigned long)addr,
+			.len = size,
+			.pre_handler = pre,
+			.post_handler = post,
+		}
+	};
+
+	relay_write(chan, &event, sizeof(event));
+	spin_lock(&trace_list_lock);
+	list_add_tail(&trace->list, &trace_list);
+	spin_unlock(&trace_list_lock);
+	if (!nommiotrace)
+		register_kmmio_probe(&trace->probe);
+}
+
+/*
+ * Apply the filter_offset and ISA_trace module parameters, then hand the
+ * mapping to do_ioremap_trace_core().
+ */
+static void ioremap_trace_core(unsigned long offset, unsigned long size,
+							void __iomem *addr)
+{
+	if ((filter_offset) && (offset != filter_offset))
+		return;
+
+	/* Don't trace the low PCI/ISA area, it's always mapped.. */
+	if (!ISA_trace && (offset < ISA_END_ADDRESS) &&
+				(offset + size > ISA_START_ADDRESS)) {
+		printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low "
+						"PCI/ISA area (0x%lx-0x%lx)\n",
+						offset, offset + size);
+		return;
+	}
+	do_ioremap_trace_core(offset, size, addr);
+}
+
+/* ioremap_cache() wrapper that also starts tracing the new mapping. */
+void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size)
+{
+	void __iomem *p = ioremap_cache(offset, size);
+	printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n",
+							offset, size, p);
+	ioremap_trace_core(offset, size, p);
+	return p;
+}
+EXPORT_SYMBOL(ioremap_cache_trace);
+
+/* ioremap_nocache() wrapper that also starts tracing the new mapping. */
+void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size)
+{
+	void __iomem *p = ioremap_nocache(offset, size);
+	printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n",
+							offset, size, p);
+	ioremap_trace_core(offset, size, p);
+	return p;
+}
+EXPORT_SYMBOL(ioremap_nocache_trace);
+
+/*
+ * iounmap() wrapper: drop the trace whose probe starts at addr (if any),
+ * log an MMIO_UNPROBE event and unmap.
+ */
+void iounmap_trace(volatile void __iomem *addr)
+{
+	struct mm_io_header_map event = {
+		.header = {
+			.type = MMIO_MAGIC |
+					(MMIO_UNPROBE << MMIO_OPCODE_SHIFT),
+			.sec = 0,
+			.nsec = 0,
+			.pid = 0,
+			.data_len = sizeof(struct mm_io_map)
+		},
+		.map = {
+			.phys = 0,
+			.addr = (unsigned long)addr,
+			.len = 0,
+			.pc = 0
+		}
+	};
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+	printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr);
+	record_timestamp(&event.header);
+
+	spin_lock(&trace_list_lock);
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		if ((unsigned long)addr == trace->probe.addr) {
+			if (!nommiotrace)
+				unregister_kmmio_probe(&trace->probe);
+			list_del(&trace->list);
+			kfree(trace);
+			break;
+		}
+	}
+	spin_unlock(&trace_list_lock);
+	relay_write(chan, &event, sizeof(event));
+	iounmap(addr);
+}
+EXPORT_SYMBOL(iounmap_trace);
+
+/*
+ * Purge every trace that is still on trace_list (mappings that were
+ * never passed to iounmap_trace()), unregistering its probe and freeing
+ * the record.
+ */
+static void clear_trace_list(void)
+{
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+
+	spin_lock(&trace_list_lock);
+	/* _safe variant: entries are deleted while walking; visit all of them */
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped "
+					"trace @0x%08lx, size 0x%lx.\n",
+					trace->probe.addr, trace->probe.len);
+		if (!nommiotrace)
+			unregister_kmmio_probe(&trace->probe);
+		list_del(&trace->list);
+		kfree(trace);
+	}
+	spin_unlock(&trace_list_lock);
+}
+
+/*
+ * Module init: create the debugfs directory and relay channel, set up
+ * kmmio and the /proc marker file.
+ * Returns 0 on success, -EINVAL for n_subbufs < 2, -ENOMEM when the
+ * directory or the channel cannot be created.
+ */
+static int __init init(void)
+{
+	if (n_subbufs < 2)
+		return -EINVAL;
+
+	dir = debugfs_create_dir(APP_DIR, NULL);
+	if (!dir) {
+		printk(KERN_ERR MODULE_NAME
+				": Couldn't create relay app directory.\n");
+		return -ENOMEM;
+	}
+
+	chan = create_channel(subbuf_size, n_subbufs);
+	if (!chan) {
+		debugfs_remove(dir);
+		printk(KERN_ERR MODULE_NAME
+				": relay app channel creation failed\n");
+		return -ENOMEM;
+	}
+
+	init_kmmio();
+
+	proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL);
+	if (proc_marker_file)
+		proc_marker_file->write_proc = write_marker;
+
+	printk(KERN_DEBUG MODULE_NAME ": loaded.\n");
+	if (nommiotrace)
+		printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n");
+	if (ISA_trace)
+		printk(KERN_WARNING MODULE_NAME
+				": Warning! low ISA range will be traced.\n");
+	return 0;
+}
+
+/* Module exit: purge leftover traces, then tear down in reverse order. */
+static void __exit cleanup(void)
+{
+	printk(KERN_DEBUG MODULE_NAME ": unload...\n");
+	clear_trace_list();
+	cleanup_kmmio();
+	remove_proc_entry(MARKER_FILE, NULL);
+	destroy_channel();
+	if (dir)
+		debugfs_remove(dir);
+}
+
+module_init(init);
+module_exit(cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c
new file mode 100644
index 00000000000..67ea520dde6
--- /dev/null
+++ b/arch/x86/kernel/mmiotrace/pf_in.c
@@ -0,0 +1,489 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 
0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, 
&enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. 
+ */
+enum {
+	arg_AL = 0,
+	arg_CL = 1,
+	arg_DL = 2,
+	arg_BL = 3,
+	arg_AH = 4,
+	arg_CH = 5,
+	arg_DH = 6,
+	arg_BH = 7,
+
+	arg_AX = 0,
+	arg_CX = 1,
+	arg_DX = 2,
+	arg_BX = 3,
+	arg_SP = 4,
+	arg_BP = 5,
+	arg_SI = 6,
+	arg_DI = 7,
+#ifdef __amd64__
+	arg_R8  = 8,
+	arg_R9  = 9,
+	arg_R10 = 10,
+	arg_R11 = 11,
+	arg_R12 = 12,
+	arg_R13 = 13,
+	arg_R14 = 14,
+	arg_R15 = 15
+#endif
+};
+
+/*
+ * Map a register number to the address of the matching 8-bit register
+ * inside the saved pt_regs. The high-byte registers (AH..DH) are byte 1
+ * of the corresponding saved register.
+ * Returns NULL (after logging) for an unknown register number.
+ */
+static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
+{
+	unsigned char *rv = NULL;
+
+	switch (no) {
+	case arg_AL:
+		rv = (unsigned char *)&regs->ax;
+		break;
+	case arg_BL:
+		rv = (unsigned char *)&regs->bx;
+		break;
+	case arg_CL:
+		rv = (unsigned char *)&regs->cx;
+		break;
+	case arg_DL:
+		rv = (unsigned char *)&regs->dx;
+		break;
+	case arg_AH:
+		rv = 1 + (unsigned char *)&regs->ax;
+		break;
+	case arg_BH:
+		rv = 1 + (unsigned char *)&regs->bx;
+		break;
+	case arg_CH:
+		rv = 1 + (unsigned char *)&regs->cx;
+		break;
+	case arg_DH:
+		rv = 1 + (unsigned char *)&regs->dx;
+		break;
+#ifdef __amd64__
+	case arg_R8:
+		rv = (unsigned char *)&regs->r8;
+		break;
+	case arg_R9:
+		rv = (unsigned char *)&regs->r9;
+		break;
+	case arg_R10:
+		rv = (unsigned char *)&regs->r10;
+		break;
+	case arg_R11:
+		rv = (unsigned char *)&regs->r11;
+		break;
+	case arg_R12:
+		rv = (unsigned char *)&regs->r12;
+		break;
+	case arg_R13:
+		rv = (unsigned char *)&regs->r13;
+		break;
+	case arg_R14:
+		rv = (unsigned char *)&regs->r14;
+		break;
+	case arg_R15:
+		rv = (unsigned char *)&regs->r15;
+		break;
+#endif
+	default:
+		printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+		break;
+	}
+	return rv;
+}
+
+/*
+ * Map a register number to the address of the matching full-width
+ * register inside the saved pt_regs.
+ * Returns NULL (after logging) for an unknown register number.
+ */
+static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
+{
+	unsigned long *rv = NULL;
+
+	switch (no) {
+	case arg_AX:
+		rv = &regs->ax;
+		break;
+	case arg_BX:
+		rv = &regs->bx;
+		break;
+	case arg_CX:
+		rv = &regs->cx;
+		break;
+	case arg_DX:
+		rv = &regs->dx;
+		break;
+	case arg_SP:
+		rv = &regs->sp;
+		break;
+	case arg_BP:
+		rv = &regs->bp;
+		break;
+	case arg_SI:
+		rv = &regs->si;
+		break;
+	case arg_DI:
+		rv = &regs->di;
+		break;
+#ifdef __amd64__
+	case arg_R8:
+		rv = &regs->r8;
+		break;
+	case arg_R9:
+		rv = &regs->r9;
+		break;
+	case arg_R10:
+		rv = &regs->r10;
+		break;
+	case arg_R11:
+		rv = &regs->r11;
+		break;
+	case arg_R12:
+		rv = &regs->r12;
+		break;
+	case arg_R13:
+		rv = &regs->r13;
+		break;
+	case arg_R14:
+		rv = &regs->r14;
+		break;
+	case arg_R15:
+		rv = &regs->r15;
+		break;
+#endif
+	default:
+		printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+	}
+
+	return rv;
+}
+
+/*
+ * Return the value of the register operand (the reg field of the mod/rm
+ * byte) of the register read/write instruction at ins_addr, truncated to
+ * the width reported by get_ins_reg_width().
+ * Returns 0 (after logging) when the opcode is not in reg_rop/reg_wop or
+ * the width is not handled.
+ */
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
+{
+	unsigned int opcode;
+	unsigned char mod_rm;
+	int reg;
+	unsigned char *p;
+	int i, shorted, enlarged, rexr;
+	unsigned long rv;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &shorted, &enlarged, &rexr);
+	p += get_opcode(p, &opcode);
+	for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
+		if (reg_rop[i] == opcode) {
+			rv = REG_READ;
+			goto do_work;
+		}
+
+	for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
+		if (reg_wop[i] == opcode) {
+			rv = REG_WRITE;
+			goto do_work;
+		}
+
+	printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
+							"0x%02x\n", opcode);
+	goto err;
+
+do_work:
+	mod_rm = *p;
+	/* reg field is bits 5:3; rexr supplies bit 3 of the register number */
+	reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
+	switch (get_ins_reg_width(ins_addr)) {
+	case 1:
+		return *get_reg_w8(reg, regs);
+
+	case 2:
+		return *(unsigned short *)get_reg_w32(reg, regs);
+
+	case 4:
+		return *(unsigned int *)get_reg_w32(reg, regs);
+
+#ifdef __amd64__
+	case 8:
+		return *(unsigned long *)get_reg_w32(reg, regs);
+#endif
+
+	default:
+		printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
+	}
+
+err:
+	return 0;
+}
+
+/*
+ * Return the immediate operand of the immediate-write instruction at
+ * ins_addr. The immediate is located by skipping the mod/rm byte and the
+ * displacement implied by its mod field, then read with the width
+ * reported by get_ins_reg_width().
+ * Returns 0 (after logging) when the opcode is not in imm_wop or the
+ * width is not handled.
+ */
+unsigned long get_ins_imm_val(unsigned long ins_addr)
+{
+	unsigned int opcode;
+	unsigned char mod_rm;
+	unsigned char mod;
+	unsigned char *p;
+	int i, shorted, enlarged, rexr;
+	unsigned long rv;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &shorted, &enlarged, &rexr);
+	p += get_opcode(p, &opcode);
+	for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
+		if (imm_wop[i] == opcode) {
+			rv = IMM_WRITE;
+			goto do_work;
+		}
+
+	printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
+							"0x%02x\n", opcode);
+	goto err;
+
+do_work:
+	mod_rm = *p;
+	mod = mod_rm >> 6;
+	p++;
+	switch (mod) {
+	case 0:
+		/* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2)  */
+		/* AMD64: XXX Check for address size prefix? */
+		if ((mod_rm & 0x7) == 0x5)
+			p += 4;
+		break;
+
+	case 1:
+		p += 1;
+		break;
+
+	case 2:
+		p += 4;
+		break;
+
+	case 3:
+	default:
+		printk(KERN_ERR "mmiotrace: not a memory access instruction "
+						"at 0x%lx, rm_mod=0x%02x\n",
+						ins_addr, mod_rm);
+	}
+
+	switch (get_ins_reg_width(ins_addr)) {
+	case 1:
+		return *(unsigned char *)p;
+
+	case 2:
+		return *(unsigned short *)p;
+
+	case 4:
+		return *(unsigned int *)p;
+
+#ifdef __amd64__
+	case 8:
+		return *(unsigned long *)p;
+#endif
+
+	default:
+		printk(KERN_ERR "mmiotrace: Error: width.\n");
+	}
+
+err:
+	return 0;
+}
diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h
new file mode 100644
index 00000000000..e05341a51a2
--- /dev/null
+++ b/arch/x86/kernel/mmiotrace/pf_in.h
@@ -0,0 +1,39 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c new file mode 100644 index 00000000000..40e66b0e648 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -0,0 +1,77 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +extern void __iomem *ioremap_nocache_trace(unsigned long offset, + unsigned long size); +extern void iounmap_trace(volatile void __iomem *addr); + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + volatile unsigned int v; + for (i = 0; i < 256; i++) + v = ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + v = ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + v = ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + if (!p) { + printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " + "aborting.\n"); + 
return; + } + do_write_test(p); + do_read_test(p); + iounmap_trace(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + printk(KERN_ERR MODULE_NAME ": you have to use the module " + "argument mmio_address.\n"); + printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h new file mode 100644 index 00000000000..cb247825f3e --- /dev/null +++ b/include/linux/mmiotrace.h @@ -0,0 +1,62 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include + +#define MMIO_VERSION 0x04 + +/* mm_io_header.type */ +#define MMIO_OPCODE_MASK 0xff +#define MMIO_OPCODE_SHIFT 0 +#define MMIO_WIDTH_MASK 0xff00 +#define MMIO_WIDTH_SHIFT 8 +#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) +#define MMIO_MAGIC_MASK 0xffff0000 + +enum mm_io_opcode { /* payload type: */ + MMIO_READ = 0x1, /* struct mm_io_rw */ + MMIO_WRITE = 0x2, /* struct mm_io_rw */ + MMIO_PROBE = 0x3, /* struct mm_io_map */ + MMIO_UNPROBE = 0x4, /* struct mm_io_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ +}; + +struct mm_io_header { + __u32 type; + __u32 sec; /* timestamp */ + __u32 nsec; + __u32 pid; /* PID of the process, or 0 for kernel core */ + __u16 data_len; /* length of the following payload */ +}; + +struct mm_io_rw { + __u64 address; /* virtual address of register */ + __u64 value; + __u64 pc; /* optional program counter */ +}; + +struct mm_io_map { + __u64 phys; /* base address in PCI space */ + __u64 addr; /* base virtual address */ + __u64 len; /* mapping size */ + __u64 pc; 
/* optional program counter */ +}; + + +/* + * These structures are used to allow a single relay_write() + * call to write a full packet. + */ + +struct mm_io_header_rw { + struct mm_io_header header; + struct mm_io_rw rw; +} __attribute__((packed)); + +struct mm_io_header_map { + struct mm_io_header header; + struct mm_io_map map; +} __attribute__((packed)); + +#endif /* MMIOTRACE_H */ -- cgit v1.2.3-70-g09d2 From 63ffa3e456c1a9884a3ebac997d91e3fdae18d78 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86 mmiotrace: comment about user space ABI Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/mmiotrace.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index cb247825f3e..6ec288f1fe2 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,10 @@ #include +/* + * If you change anything here, you must bump MMIO_VERSION. + * This is the relay data format for user space. + */ #define MMIO_VERSION 0x04 /* mm_io_header.type */ @@ -23,7 +27,7 @@ enum mm_io_opcode { /* payload type: */ }; struct mm_io_header { - __u32 type; + __u32 type; /* see MMIO_* macros above */ __u32 sec; /* timestamp */ __u32 nsec; __u32 pid; /* PID of the process, or 0 for kernel core */ -- cgit v1.2.3-70-g09d2 From 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace full patch, preview 1 kmmio.c handles the list of mmio probes with callbacks, list of traced pages, and attaching into the page fault handler and die notifier. It arms, traps and disarms the given pages, this is the core of mmiotrace. mmio-mod.c is a user interface, hooking into ioremap functions and registering the mmio probes. 
It also decodes the required information from trapped mmio accesses via the pre and post callbacks in each probe. Currently, hooking into ioremap functions works by redefining the symbols of the target (binary) kernel module, so that it calls the traced versions of the functions. The most notable changes done since the last discussion are: - kmmio.c is a built-in, not part of the module - direct call from fault.c to kmmio.c, removing all dynamic hooks - prepare for unregistering probes at any time - make kmmio re-initializable and accessible to more than one user - rewrite kmmio locking to remove all spinlocks from page fault path Can I abuse call_rcu() like I do in kmmio.c:unregister_kmmio_probe() or is there a better way? The function called via call_rcu() itself calls call_rcu() again, will this work or break? There I need a second grace period for RCU after the first grace period for page faults. Mmiotrace itself (mmio-mod.c) is still a module, I am going to attack that next. At some point I will start looking into how to make mmiotrace a tracer component of ftrace (thanks for the hint, Ingo). Ftrace should make the user space part of mmiotracing as simple as 'cat /debug/trace/mmio > dump.txt'. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/init_task.c | 1 - arch/x86/kernel/mmiotrace/Makefile | 8 +- arch/x86/kernel/mmiotrace/kmmio.c | 349 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/kmmio.h | 58 ----- arch/x86/kernel/mmiotrace/mmio-mod.c | 81 ++++--- arch/x86/kernel/mmiotrace/pf_in.c | 2 +- arch/x86/kernel/mmiotrace/testmmiotrace.c | 13 +- arch/x86/mm/fault.c | 59 +---- include/asm-x86/kdebug.h | 7 - include/linux/mmiotrace.h | 38 ++++ 10 files changed, 335 insertions(+), 281 deletions(-) delete mode 100644 arch/x86/kernel/mmiotrace/kmmio.h (limited to 'include/linux') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 027a5b6a12b..a4f93b4120c 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ -EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. 
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index d6905f7f981..cf1e747b463 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o - -obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 5e239d0b846..539a9b19588 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -17,70 +18,119 @@ #include #include #include +#include #include #include #include #include #include -#include "kmmio.h" +#include -#define KMMIO_HASH_BITS 6 -#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) #define KMMIO_PAGE_HASH_BITS 4 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). 
+ */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + struct kmmio_context { struct kmmio_fault_page *fpage; struct kmmio_probe *probe; unsigned long saved_flags; + unsigned long addr; int active; }; -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); +static DECLARE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ +static int kmmio_initialized; unsigned int kmmio_count; -static unsigned int handler_registered; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); +/* protected by kmmio_init_mutex */ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -int init_kmmio(void) +/** + * Makes sure kmmio is initialized and usable. + * This must be called before any other kmmio function defined here. + * May sleep. + */ +void reference_kmmio(void) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - - register_die_notifier(&nb_die); - return 0; + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + if (!kmmio_initialized) { + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + if (register_die_notifier(&nb_die)) + BUG(); + } + kmmio_initialized++; + spin_unlock_irq(&kmmio_lock); + up(&kmmio_init_mutex); } +EXPORT_SYMBOL_GPL(reference_kmmio); -void cleanup_kmmio(void) +/** + * Clean up kmmio after use. This must be called for every call to + * reference_kmmio(). 
All probes registered after the corresponding + * reference_kmmio() must have been unregistered when calling this. + * May sleep. + */ +void unreference_kmmio(void) { - /* - * Assume the following have been already cleaned by calling - * unregister_kmmio_probe() appropriately: - * kmmio_page_table, kmmio_probes - */ - if (handler_registered) { - if (mmiotrace_unregister_pf(&kmmio_page_fault)) - BUG(); - synchronize_rcu(); + bool unreg = false; + + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + + if (kmmio_initialized == 1) { + BUG_ON(is_kmmio_active()); + unreg = true; } - unregister_die_notifier(&nb_die); + kmmio_initialized--; + BUG_ON(kmmio_initialized < 0); + spin_unlock_irq(&kmmio_lock); + + if (unreg) + unregister_die_notifier(&nb_die); /* calls sync_rcu() */ + up(&kmmio_init_mutex); } +EXPORT_SYMBOL(unreference_kmmio); /* * this is basically a dynamic stabbing problem: @@ -90,33 +140,33 @@ void cleanup_kmmio(void) * Overlap a Point (might be simple) * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup */ -/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) { struct kmmio_probe *p; - list_for_each_entry(p, &kmmio_probes, list) { + list_for_each_entry_rcu(p, &kmmio_probes, list) { if (addr >= p->addr && addr <= (p->addr + p->len)) return p; } return NULL; } +/* You must be holding RCU read lock. 
*/ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { - struct list_head *head, *tmp; + struct list_head *head; + struct kmmio_fault_page *p; page &= PAGE_MASK; - head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; - list_for_each(tmp, head) { - struct kmmio_fault_page *p - = list_entry(tmp, struct kmmio_fault_page, list); + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { if (p->page == page) return p; } - return NULL; } +/** Mark the given page as not present. Access to it will trigger a fault. */ static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/** Mark the given page as present. */ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -169,13 +220,25 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. 
+ * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate * and they remain disabled thorough out this function. */ -static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; /* * Preemption is now disabled to prevent process switch during @@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * XXX what if an interrupt occurs between returning from * do_page_fault() and entering the single-step exception handler? * And that interrupt triggers a kmmio trap? + * XXX If we tracing an interrupt service routine or whatever, is + * this enough to keep it on the current cpu? */ preempt_disable(); - /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. In the latter case all hell breaks loose. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { /* - * This avoids a deadlock with kmmio_lock. + * Prevent overwriting already in-flight context. * If this page fault really was due to kmmio trap, * all hell breaks loose. */ - printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " - "for address %lu. Ignoring.\n", + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. 
Ignoring.\n", smp_processor_id(), addr); - goto no_kmmio; + goto no_kmmio_ctx; } ctx->active++; - /* - * Acquire the kmmio lock to prevent changes affecting - * get_kmmio_fault_page() and get_kmmio_probe(), since we save their - * returned pointers. - * The lock is released in post_kmmio_handler(). - * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? - */ - spin_lock(&kmmio_lock); - - ctx->fpage = get_kmmio_fault_page(addr); - if (!ctx->fpage) { - /* this page fault is not caused by kmmio */ - goto no_kmmio_locked; - } - + ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; - /* We hold lock, now we set present bit in PTE and single step. */ + /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); put_cpu_var(kmmio_ctx); + rcu_read_unlock(); return 1; -no_kmmio_locked: - spin_unlock(&kmmio_lock); - ctx->active--; +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); no_kmmio: + rcu_read_unlock(); preempt_enable_no_resched(); - put_cpu_var(kmmio_ctx); - /* page fault not handled by kmmio */ - return 0; + return 0; /* page fault not handled by kmmio */ } /* * Interrupts are disabled on entry as trap1 is an interrupt gate * and they remain disabled thorough out this function. - * And we hold kmmio lock. + * This must always get called as the pair to kmmio_handler(). 
*/ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; + struct kmmio_probe *probe; + struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(ctx->addr); + probe = get_kmmio_probe(ctx->addr); + if (faultpage != ctx->fpage || probe != ctx->probe) { + /* + * The trace setup changed after kmmio_handler() and before + * running this respective post handler. User does not want + * the result anymore. + */ + ctx->probe = NULL; + ctx->fpage = NULL; + } + if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + if (ctx->fpage) + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ ctx->active--; - spin_unlock(&kmmio_lock); + BUG_ON(ctx->active); preempt_enable_no_resched(); /* @@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (!(regs->flags & TF_MASK)) ret = 1; + rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; } +/* You must be holding kmmio_lock. */ static int add_kmmio_fault_page(unsigned long page) { struct kmmio_fault_page *f; @@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page) page &= PAGE_MASK; f = get_kmmio_fault_page(page); if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } @@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page) f->count = 1; f->page = page; - list_add(&f->list, - &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + list_add_rcu(&f->list, kmmio_page_list(f->page)); arm_kmmio_fault_page(f->page, NULL); return 0; } -static void release_kmmio_fault_page(unsigned long page) +/* You must be holding kmmio_lock. 
*/ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; @@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page) return; f->count--; + BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f->page, NULL); - list_del(&f->list); + f->release_next = *release_list; + *release_list = f; } } @@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p) ret = -EEXIST; goto out; } - list_add(&p->list, &kmmio_probes); - /*printk("adding fault pages...\n");*/ + list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) - printk(KERN_ERR "mmio: Unable to set page fault.\n"); + pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; } - - if (!handler_registered) { - if (mmiotrace_register_pf(&kmmio_page_fault)) - printk(KERN_ERR "mmiotrace: Cannot register page " - "fault handler.\n"); - else - handler_registered++; - } - out: spin_unlock_irq(&kmmio_lock); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist - * anymore. + * anymore. It seems it's not needed after all. 
*/ return ret; } +EXPORT_SYMBOL(register_kmmio_probe); +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actally free the kmmio_fault_page structs as with RCU. 
+ */ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; spin_lock_irq(&kmmio_lock); while (size < p->len) { - release_kmmio_fault_page(p->addr + size); + release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } - list_del(&p->list); + list_del_rcu(&p->list); kmmio_count--; spin_unlock_irq(&kmmio_lock); -} -/* - * According to 2.6.20, mainly x86_64 arch: - * This is being called from do_page_fault(), via the page fault notifier - * chain. The chain is called for both user space faults and kernel space - * faults (address >= TASK_SIZE64), except not on faults serviced by - * vmalloc_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. We may be in the middle of process switch. - * The page fault hook functionality has put us inside RCU read lock. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - if (is_kmmio_active()) - if (kmmio_handler(regs, address) == 1) - return -1; - return 0; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. 
+ * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); } +EXPORT_SYMBOL(unregister_kmmio_probe); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h deleted file mode 100644 index 85b7f68a3b8..00000000000 --- a/arch/x86/kernel/mmiotrace/kmmio.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _LINUX_KMMIO_H -#define _LINUX_KMMIO_H - -#include -#include -#include -#include -#include -#include -#include - -struct kmmio_probe; -struct kmmio_fault_page; -struct pt_regs; - -typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, - struct pt_regs *, unsigned long addr); -typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, - unsigned long condition, struct pt_regs *); - -struct kmmio_probe { - struct list_head list; - - /* start location of the probe point */ - unsigned long addr; - - /* length of the probe region */ - unsigned long len; - - /* Called before addr is executed. */ - kmmio_pre_handler_t pre_handler; - - /* Called after addr is executed, unless... */ - kmmio_post_handler_t post_handler; -}; - -struct kmmio_fault_page { - struct list_head list; - - /* location of the fault page */ - unsigned long page; - - int count; -}; - -/* kmmio is active by some kmmio_probes? 
*/ -static inline int is_kmmio_active(void) -{ - extern unsigned int kmmio_count; - return kmmio_count; -} - -int init_kmmio(void); -void cleanup_kmmio(void); -int register_kmmio_probe(struct kmmio_probe *p); -void unregister_kmmio_probe(struct kmmio_probe *p); - -#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index f9c609266d8..e1a508588f0 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -32,7 +32,6 @@ #include #include -#include "kmmio.h" #include "pf_in.h" /* This app's relay channel files will appear in /debug/mmio-trace */ @@ -129,18 +128,17 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, address); + pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " - "currently supported: %lx\n", - address); + pr_emerg(MODULE_NAME ": 4MB pages are not currently " + "supported: %lx\n", address); BUG(); } - printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), pte_val(*pte) & _PAGE_PRESENT); } @@ -152,7 +150,7 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", addr, my_reason->addr); print_pte(addr); @@ -160,20 +158,17 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", 
my_reason->ip); - printk(KERN_EMERG - "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_EMERG - "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", my_reason->ip); - printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); - printk(KERN_EMERG "rsi: %016lx rdi: %016lx " - "rbp: %016lx rsp: %016lx\n", + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); @@ -251,10 +246,15 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + /* + * XXX: This might not get called, if the probe is removed while + * trace hit is on flight. 
+ */ + /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + pr_emerg(MODULE_NAME ": unexpected post handler"); BUG(); } @@ -283,16 +283,15 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) { - printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", - cpu); - } + if (atomic_inc_return(drop) == 1) + pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); return 0; - } else if ((count = atomic_read(drop))) { - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + } + count = atomic_read(drop); + if (count) { + pr_err(MODULE_NAME ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); atomic_sub(count, drop); } @@ -407,8 +406,8 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ if (!ISA_trace && (offset < ISA_END_ADDRESS) && (offset + size > ISA_START_ADDRESS)) { - printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " - "PCI/ISA area (0x%lx-0x%lx)\n", + pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " + "(0x%lx-0x%lx)\n", offset, offset + size); return; } @@ -418,7 +417,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_cache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -428,7 +427,7 @@ EXPORT_SYMBOL(ioremap_cache_trace); void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_nocache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -455,7 +454,7 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); record_timestamp(&event.header); spin_lock(&trace_list_lock); @@ -481,7 +480,7 @@ static void clear_trace_list(void) spin_lock(&trace_list_lock); list_for_each_entry_safe(trace, tmp, &trace_list, list) { - printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + pr_warning(MODULE_NAME ": purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) @@ -500,39 +499,37 @@ static int __init init(void) dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - printk(KERN_ERR MODULE_NAME - ": Couldn't create relay app directory.\n"); + pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); 
return -ENOMEM; } chan = create_channel(subbuf_size, n_subbufs); if (!chan) { debugfs_remove(dir); - printk(KERN_ERR MODULE_NAME - ": relay app channel creation failed\n"); + pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - init_kmmio(); + reference_kmmio(); proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); if (proc_marker_file) proc_marker_file->write_proc = write_marker; - printk(KERN_DEBUG MODULE_NAME ": loaded.\n"); + pr_debug(MODULE_NAME ": loaded.\n"); if (nommiotrace) - printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); if (ISA_trace) - printk(KERN_WARNING MODULE_NAME - ": Warning! low ISA range will be traced.\n"); + pr_warning(MODULE_NAME ": Warning! low ISA range will be " + "traced.\n"); return 0; } static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + pr_debug(MODULE_NAME ": unload...\n"); clear_trace_list(); - cleanup_kmmio(); + unreference_kmmio(); remove_proc_entry(MARKER_FILE, NULL); destroy_channel(); if (dir) diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c index 67ea520dde6..efa1911e20c 100644 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -19,7 +19,7 @@ * */ -/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp * Copyright by Intel Crop., 2002 * Louis Zhuang (louis.zhuang@intel.com) * diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 40e66b0e648..5ecff578672 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -41,8 +41,7 @@ static void do_test(void) { void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); if (!p) { - printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " - "aborting.\n"); + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } 
do_write_test(p); @@ -53,14 +52,14 @@ static void do_test(void) static int __init init(void) { if (mmio_address == 0) { - printk(KERN_ERR MODULE_NAME ": you have to use the module " - "argument mmio_address.\n"); - printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " "in PCI address space, and writing " "rubbish in there.\n", mmio_address); do_test(); @@ -69,7 +68,7 @@ static int __init init(void) static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); + pr_debug(MODULE_NAME ": unloaded.\n"); } module_init(init); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e9a086a1a9f..8c828a68d3b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -49,60 +50,14 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_MMIOTRACE_HOOKS -static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ -static DEFINE_SPINLOCK(mmiotrace_handler_lock); - -int mmiotrace_register_pf(pf_handler_func new_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler) - ret = -EBUSY; - else - mmiotrace_pf_handler = new_pfh; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_register_pf); - -/** - * mmiotrace_unregister_pf: - * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The handler function pointer is - * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
- */ -int mmiotrace_unregister_pf(pf_handler_func old_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler != old_pfh) - ret = -EPERM; - else - mmiotrace_pf_handler = NULL; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); -#endif /* CONFIG_MMIOTRACE_HOOKS */ - -/* returns non-zero if do_page_fault() should return */ -static inline int call_mmiotrace(struct pt_regs *regs, - unsigned long error_code, - unsigned long address) +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { #ifdef CONFIG_MMIOTRACE_HOOKS - int ret = 0; - rcu_read_lock(); - if (mmiotrace_pf_handler) - ret = mmiotrace_pf_handler(regs, error_code, address); - rcu_read_unlock(); - return ret; -#else - return 0; + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; #endif + return 0; } static inline int notify_page_fault(struct pt_regs *regs) @@ -657,7 +612,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (call_mmiotrace(regs, error_code, address)) + if (unlikely(kmmio_fault(regs, address))) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 7063281040d..96651bb59ba 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,11 +35,4 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -typedef int (*pf_handler_func)(struct pt_regs *regs, - unsigned long error_code, - unsigned long address); - -extern int mmiotrace_register_pf(pf_handler_func new_pfh); -extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); - #endif diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 6ec288f1fe2..d87a6cd8b68 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ 
-3,6 +3,44 @@ #include +#ifdef __KERNEL__ + +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern void reference_kmmio(void); +extern void unreference_kmmio(void); +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. */ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +#endif /* __KERNEL__ */ + + /* * If you change anything here, you must bump MMIO_VERSION. * This is the relay data format for user space. -- cgit v1.2.3-70-g09d2 From d61fc44853f46fb002228b18aa5f30db21fcd4ac Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace, preview 2 Kconfig.debug, Makefile and testmmiotrace.c style fixes. Use a real mutex instead of a semaphore. Fix failure path in register probe func. kmmio: RCU read-locked over single stepping. Generate mapping id's. Make mmio-mod.c built-in and rewrite its locking. Add debugfs file to enable/disable mmiotracing. kmmio: use irqsave spinlocks. Lots of cleanups in mmio-mod.c. Marker file moved from /proc into debugfs. Call mmiotrace entrypoints directly from ioremap.c. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 20 +- arch/x86/kernel/mmiotrace/Makefile | 2 +- arch/x86/kernel/mmiotrace/kmmio.c | 72 +++--- arch/x86/kernel/mmiotrace/mmio-mod.c | 397 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/testmmiotrace.c | 15 +- arch/x86/mm/ioremap.c | 9 +- include/linux/mmiotrace.h | 18 +- 7 files changed, 332 insertions(+), 201 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9491c0ae03a..aa0d6462b1f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -170,22 +170,19 @@ config IOMMU_LEAK config MMIOTRACE_HOOKS bool - default n config MMIOTRACE - tristate "Memory mapped IO tracing" + bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && RELAY && DEBUG_FS select MMIOTRACE_HOOKS - default n + default y help - This will build a kernel module called mmiotrace. - Making this a built-in is heavily discouraged. - - Mmiotrace traces Memory Mapped I/O access and is meant for debugging - and reverse engineering. The kernel module offers wrapped - versions of the ioremap family of functions. The driver to be traced - must be modified to call these wrappers. A user space program is - required to collect the MMIO data. + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. A user space program is + required to collect the MMIO data from debugfs files. + Tracing is disabled by default and can be enabled from a debugfs + file. See http://nouveau.freedesktop.org/wiki/MmioTrace If you are not helping to develop drivers, say N. @@ -193,7 +190,6 @@ config MMIOTRACE config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m - default n help This is a dumb module for testing mmiotrace. 
It is very dangerous as it will write garbage to IO memory starting at a given address. diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index cf1e747b463..dbcd8d50fb8 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o mmio-mod.o +mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 539a9b19588..efb46793308 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct kmmio_context { static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); -static DECLARE_MUTEX(kmmio_init_mutex); +static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ @@ -90,7 +91,7 @@ static struct notifier_block nb_die = { */ void reference_kmmio(void) { - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (!kmmio_initialized) { int i; @@ -101,7 +102,7 @@ void reference_kmmio(void) } kmmio_initialized++; spin_unlock_irq(&kmmio_lock); - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL_GPL(reference_kmmio); @@ -115,7 +116,7 @@ void unreference_kmmio(void) { bool unreg = false; - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (kmmio_initialized == 1) { @@ -128,7 +129,7 @@ void unreference_kmmio(void) if (unreg) unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL(unreference_kmmio); @@ -244,17 +245,13 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Preemption is now 
disabled to prevent process switch during * single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. - * - * XXX what if an interrupt occurs between returning from - * do_page_fault() and entering the single-step exception handler? - * And that interrupt triggers a kmmio trap? - * XXX If we tracing an interrupt service routine or whatever, is - * this enough to keep it on the current cpu? + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. */ preempt_disable(); - rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); if (!faultpage) { /* @@ -287,14 +284,24 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. 
+ */ + put_cpu_var(kmmio_ctx); - rcu_read_unlock(); return 1; no_kmmio_ctx: @@ -313,32 +320,15 @@ no_kmmio: static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; - struct kmmio_probe *probe; - struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(ctx->addr); - probe = get_kmmio_probe(ctx->addr); - if (faultpage != ctx->fpage || probe != ctx->probe) { - /* - * The trace setup changed after kmmio_handler() and before - * running this respective post handler. User does not want - * the result anymore. - */ - ctx->probe = NULL; - ctx->fpage = NULL; - } - if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - if (ctx->fpage) - arm_kmmio_fault_page(ctx->fpage->page, NULL); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; @@ -346,6 +336,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). 
*/ ctx->active--; BUG_ON(ctx->active); + rcu_read_unlock(); preempt_enable_no_resched(); /* @@ -355,8 +346,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) */ if (!(regs->flags & TF_MASK)) ret = 1; - - rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; @@ -411,15 +400,16 @@ static void release_kmmio_fault_page(unsigned long page, int register_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; int ret = 0; unsigned long size = 0; - spin_lock_irq(&kmmio_lock); - kmmio_count++; + spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) @@ -427,7 +417,7 @@ int register_kmmio_probe(struct kmmio_probe *p) size += PAGE_SIZE; } out: - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -478,7 +468,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) /* * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. * * Unregistering a kmmio fault page has three steps: * 1. 
release_kmmio_fault_page() @@ -490,18 +481,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) */ void unregister_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; unsigned long size = 0; struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; - spin_lock_irq(&kmmio_lock); + spin_lock_irqsave(&kmmio_lock, flags); while (size < p->len) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e1a508588f0..738644061e4 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -19,6 +19,8 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. */ +#define DEBUG 1 + #include #include #include @@ -34,12 +36,12 @@ #include "pf_in.h" -/* This app's relay channel files will appear in /debug/mmio-trace */ -#define APP_DIR "mmio-trace" -/* the marker injection file in /proc */ -#define MARKER_FILE "mmio-marker" +#define NAME "mmiotrace: " -#define MODULE_NAME "mmiotrace" +/* This app's relay channel files will appear in /debug/mmio-trace */ +static const char APP_DIR[] = "mmio-trace"; +/* the marker injection file in /debug/APP_DIR */ +static const char MARKER_FILE[] = "mmio-marker"; struct trap_reason { unsigned long addr; @@ -48,6 +50,15 @@ struct trap_reason { int active_traces; }; +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +static const size_t subbuf_size = 256*1024; + /* Accessed per-cpu. 
*/ static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); @@ -55,33 +66,53 @@ static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; +static struct dentry *dir; +static struct dentry *enabled_file; +static struct dentry *marker_file; -static const size_t subbuf_size = 256*1024; +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ static struct rchan *chan; -static struct dentry *dir; -static struct proc_dir_entry *proc_marker_file; + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that chan is valid. + * - pre/post callbacks assume the effect of is_enabled() being true. 
+ */ /* module parameters */ -static unsigned int n_subbufs = 32*4; -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; +static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); +module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); +MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} static void record_timestamp(struct mm_io_header *header) { @@ -93,15 +124,15 @@ static void record_timestamp(struct mm_io_header *header) } /* - * Write callback for the /proc entry: + * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log */ -static int write_marker(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char *event = NULL; struct mm_io_header *headp; - int len = (count > 65535) ? 65535 : count; + ssize_t len = (count > 65535) ? 
65535 : count; event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); if (!event) @@ -117,7 +148,12 @@ static int write_marker(struct file *file, const char __user *buffer, return -EFAULT; } - relay_write(chan, event, sizeof(*headp) + len); + spin_lock_irq(&trace_lock); + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else + len = -EINVAL; + spin_unlock_irq(&trace_lock); kfree(event); return len; } @@ -128,19 +164,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(MODULE_NAME ": 4MB pages are not currently " - "supported: %lx\n", address); + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); BUG(); } - pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* @@ -150,22 +185,18 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " - "last fault for address: %lx\n", + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", addr, my_reason->addr); print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ - print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - my_reason->ip); pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: 
%08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else - print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - my_reason->ip); pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", @@ -197,6 +228,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_trace->header.pid = 0; my_trace->header.data_len = sizeof(struct mm_io_rw); my_trace->rw.address = addr; + /* + * struct remap_trace *trace = p->user_data; + * phys = addr - trace->probe.addr + trace->phys; + */ /* * Only record the program counter when requested. @@ -246,15 +281,10 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); - /* - * XXX: This might not get called, if the probe is removed while - * trace hit is on flight. - */ - /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(MODULE_NAME ": unexpected post handler"); + pr_emerg(NAME "unexpected post handler"); BUG(); } @@ -284,20 +314,23 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) - pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); + pr_err(NAME "cpu %d buffer full!\n", cpu); return 0; } count = atomic_read(drop); if (count) { - pr_err(MODULE_NAME ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", + cpu, count); atomic_sub(count, drop); } return 1; } +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + /* file_create() callback. Creates relay file in debugfs. 
*/ static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, @@ -333,34 +366,10 @@ static struct rchan_callbacks relay_callbacks = { .remove_buf_file = remove_buf_file_handler, }; -/* - * create_channel - creates channel /debug/APP_DIR/cpuXXX - * Returns channel on success, NULL otherwise - */ -static struct rchan *create_channel(unsigned size, unsigned n) -{ - return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); -} - -/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ -static void destroy_channel(void) -{ - if (chan) { - relay_close(chan); - chan = NULL; - } -} - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; -}; -static LIST_HEAD(trace_list); -static DEFINE_SPINLOCK(trace_list_lock); - -static void do_ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { + static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); struct mm_io_header_map event = { .header = { @@ -380,61 +389,49 @@ static void do_ioremap_trace_core(unsigned long offset, unsigned long size, }; record_timestamp(&event.header); + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + *trace = (struct remap_trace) { .probe = { .addr = (unsigned long)addr, .len = size, .pre_handler = pre, .post_handler = post, - } + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) }; + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + relay_write(chan, &event, sizeof(event)); - spin_lock(&trace_list_lock); list_add_tail(&trace->list, &trace_list); - spin_unlock(&trace_list_lock); if (!nommiotrace) register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) +void +mmiotrace_ioremap(unsigned long offset, unsigned 
long size, void __iomem *addr) { - if ((filter_offset) && (offset != filter_offset)) + if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - /* Don't trace the low PCI/ISA area, it's always mapped.. */ - if (!ISA_trace && (offset < ISA_END_ADDRESS) && - (offset + size > ISA_START_ADDRESS)) { - pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " - "(0x%lx-0x%lx)\n", - offset, offset + size); + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) return; - } - do_ioremap_trace_core(offset, size, addr); -} - -void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_cache(offset, size); - pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; + ioremap_trace_core(offset, size, addr); } -EXPORT_SYMBOL(ioremap_cache_trace); -void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_nocache(offset, size); - pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; -} -EXPORT_SYMBOL(ioremap_nocache_trace); - -void iounmap_trace(volatile void __iomem *addr) +static void iounmap_trace_core(volatile void __iomem *addr) { struct mm_io_header_map event = { .header = { @@ -454,84 +451,212 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); record_timestamp(&event.header); - spin_lock(&trace_list_lock); + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + list_for_each_entry_safe(trace, tmp, &trace_list, list) { if ((unsigned long)addr == trace->probe.addr) { if (!nommiotrace) unregister_kmmio_probe(&trace->probe); 
list_del(&trace->list); - kfree(trace); + found_trace = trace; break; } } - spin_unlock(&trace_list_lock); relay_write(chan, &event, sizeof(event)); - iounmap(addr); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); } -EXPORT_SYMBOL(iounmap_trace); static void clear_trace_list(void) { struct remap_trace *trace; struct remap_trace *tmp; - spin_lock(&trace_list_lock); - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - pr_warning(MODULE_NAME ": purging non-iounmapped " + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { list_del(&trace->list); kfree(trace); + } +} + +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (is_enabled()) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static void enable_mmiotrace(void); +static void disable_mmiotrace(void); + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size = min(count, (sizeof(buf)-1)); + + if (copy_from_user(buf, user_buf, buf_size)) 
+ return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_mmiotrace(); + break; + case 'n': + case 'N': + case '0': + disable_mmiotrace(); break; } - spin_unlock(&trace_list_lock); + + return count; +} + +/* this ripped from kernel/kprobes.c */ +static struct file_operations fops_enabled = { + .owner = THIS_MODULE, + .read = read_enabled_file_bool, + .write = write_enabled_file_bool +}; + +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; + +static void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + chan = relay_open("cpu", dir, subbuf_size, n_subbufs, + &relay_callbacks, NULL); + if (!chan) { + pr_err(NAME "relay app channel creation failed.\n"); + goto out; + } + + reference_kmmio(); + + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! 
low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +static void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + unreference_kmmio(); + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + if (chan) { + relay_close(chan); + chan = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); } static int __init init(void) { + pr_debug(NAME "load...\n"); if (n_subbufs < 2) return -EINVAL; dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); + pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - chan = create_channel(subbuf_size, n_subbufs); - if (!chan) { + enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, + &fops_enabled); + if (!enabled_file) { + pr_err(NAME "Couldn't create enabled file.\n"); debugfs_remove(dir); - pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - reference_kmmio(); - - proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); - if (proc_marker_file) - proc_marker_file->write_proc = write_marker; + if (enable_now) + enable_mmiotrace(); - pr_debug(MODULE_NAME ": loaded.\n"); - if (nommiotrace) - pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(MODULE_NAME ": Warning! 
low ISA range will be " - "traced.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unload...\n"); - clear_trace_list(); - unreference_kmmio(); - remove_proc_entry(MARKER_FILE, NULL); - destroy_channel(); + pr_debug(NAME "unload...\n"); + if (enabled_file) + debugfs_remove(enabled_file); + disable_mmiotrace(); if (dir) debugfs_remove(dir); } diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 5ecff578672..cfa60b227c8 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -4,10 +4,6 @@ #include #include -extern void __iomem *ioremap_nocache_trace(unsigned long offset, - unsigned long size); -extern void iounmap_trace(volatile void __iomem *addr); - #define MODULE_NAME "testmmiotrace" static unsigned long mmio_address; @@ -28,25 +24,24 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; - volatile unsigned int v; for (i = 0; i < 256; i++) - v = ioread8(p + i); + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - v = ioread16(p + i); + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - v = ioread32(p + i); + ioread32(p + i); } static void do_test(void) { - void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); do_read_test(p); - iounmap_trace(p); + iounmap(p); } static int __init init(void) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb3159031..8927c878544 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long new_prot_val; pgprot_t prot; int retval; + void __iomem *ret_addr; /* Don't allow wraparound or 
zero size */ last_addr = phys_addr + size - 1; @@ -233,7 +235,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - return (void __iomem *) (vaddr + offset); + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(phys_addr, size, ret_addr); + + return ret_addr; } /** @@ -325,6 +330,8 @@ void iounmap(volatile void __iomem *addr) addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); + mmiotrace_iounmap(addr); + /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index d87a6cd8b68..cb5efd0c7f5 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -16,11 +16,12 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; + struct list_head list; /* kmmio internal list */ unsigned long addr; /* start location of the probe point */ unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *user_data; }; /* kmmio is active by some kmmio_probes? */ @@ -38,6 +39,21 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. 
*/ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ +} +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + #endif /* __KERNEL__ */ -- cgit v1.2.3-70-g09d2 From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: add mmiotrace plugin On Sat, 22 Mar 2008 13:07:47 +0100 Ingo Molnar wrote: > > > i'd suggest the following: pull x86.git and sched-devel.git into a > > > single tree [the two will combine without rejects]. Then try to add a > > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c > > > plugin might be a good example. > > > > I did this and now I have mmiotrace enabled/disabled via the tracing > > framework (what do we call this, since ftrace is one of the tracers?). > > cool! could you send the patches for that? (even if they are not fully > functional yet) Patch attached in the end. Nice to see how much code disappeared. I tried to mark all the features I had to break with XXX-comments. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 3 +- arch/x86/kernel/mmiotrace/mmio-mod.c | 208 +++++------------------------------ include/linux/mmiotrace.h | 6 + kernel/trace/Makefile | 1 + kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++ 5 files changed, 123 insertions(+), 179 deletions(-) create mode 100644 kernel/trace/trace_mmiotrace.c (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index aa0d6462b1f..7e4b8494078 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,8 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY + select TRACING select MMIOTRACE_HOOKS default y help diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 738644061e4..c7a67d7e482 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -22,9 +22,8 @@ #define DEBUG 1 #include -#include #include -#include +#include #include #include #include @@ -63,18 +62,18 @@ static const size_t subbuf_size = 256*1024; static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +#if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. 
*/ static DEFINE_PER_CPU(atomic_t, dropped); +#endif static struct dentry *dir; -static struct dentry *enabled_file; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; static LIST_HEAD(trace_list); /* struct remap_trace */ -static struct rchan *chan; /* * Locking in this file: @@ -93,36 +92,24 @@ static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); -MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); static bool is_enabled(void) { return atomic_read(&mmiotrace_enabled); } -static void record_timestamp(struct mm_io_header *header) -{ - struct timespec now; - - getnstimeofday(&now); - header->sec = now.tv_sec; - header->nsec = now.tv_nsec; -} - /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -141,7 +128,6 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, headp = (struct mm_io_header *)event; headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); headp->data_len = len; - record_timestamp(headp); if (copy_from_user(event + sizeof(*headp), buffer, len)) { kfree(event); @@ -149,9 +135,11 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, } spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ if (is_enabled()) 
relay_write(chan, event, sizeof(*headp) + len); else +#endif len = -EINVAL; spin_unlock_irq(&trace_lock); kfree(event); @@ -242,7 +230,11 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, else my_trace->rw.pc = 0; - record_timestamp(&my_trace->header); + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ switch (type) { case REG_READ: @@ -295,77 +287,19 @@ static void post(struct kmmio_probe *p, unsigned long condition, default: break; } - relay_write(chan, my_trace, sizeof(*my_trace)); + + /* + * XXX: Several required values are ignored: + * - mapping id + * - program counter + * Also the address should be physical, not virtual. + */ + mmio_trace_record(my_trace->header.type, my_trace->rw.address, + my_trace->rw.value); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } -/* - * subbuf_start() relay callback. - * - * Defined so that we know when events are dropped due to the buffer-full - * condition. - */ -static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - unsigned int cpu = buf->cpu; - atomic_t *drop = &per_cpu(dropped, cpu); - int count; - if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) - pr_err(NAME "cpu %d buffer full!\n", cpu); - return 0; - } - count = atomic_read(drop); - if (count) { - pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", - cpu, count); - atomic_sub(count, drop); - } - - return 1; -} - -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; - -/* file_create() callback. Creates relay file in debugfs. 
*/ -static struct dentry *create_buf_file_handler(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - struct dentry *buf_file; - - mmio_fops.read = relay_file_operations.read; - mmio_fops.open = relay_file_operations.open; - mmio_fops.poll = relay_file_operations.poll; - mmio_fops.mmap = relay_file_operations.mmap; - mmio_fops.release = relay_file_operations.release; - mmio_fops.splice_read = relay_file_operations.splice_read; - - buf_file = debugfs_create_file(filename, mode, parent, buf, - &mmio_fops); - - return buf_file; -} - -/* file_remove() default callback. Removes relay file in debugfs. */ -static int remove_buf_file_handler(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks relay_callbacks = { - .subbuf_start = subbuf_start_handler, - .create_buf_file = create_buf_file_handler, - .remove_buf_file = remove_buf_file_handler, -}; - static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { @@ -387,7 +321,6 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .pc = 0 } }; - record_timestamp(&event.header); if (!trace) { pr_err(NAME "kmalloc failed in ioremap\n"); @@ -410,7 +343,10 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, if (!is_enabled()) goto not_enabled; - relay_write(chan, &event, sizeof(event)); + /* + * XXX: Insufficient data recorded! 
+ */ + mmio_trace_record(event.header.type, event.map.addr, event.map.len); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -454,7 +390,6 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *found_trace = NULL; pr_debug(NAME "Unmapping %p.\n", addr); - record_timestamp(&event.header); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -469,7 +404,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - relay_write(chan, &event, sizeof(event)); + mmio_trace_record(event.header.type, event.map.addr, + found_trace ? found_trace->id : -1); not_enabled: spin_unlock_irq(&trace_lock); @@ -512,77 +448,23 @@ static void clear_trace_list(void) } } -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (is_enabled()) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static void enable_mmiotrace(void); -static void disable_mmiotrace(void); - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - int buf_size = min(count, (sizeof(buf)-1)); - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - enable_mmiotrace(); - break; - case 'n': - case 'N': - case '0': - disable_mmiotrace(); - break; - } - - return count; -} - -/* this ripped from kernel/kprobes.c */ -static struct file_operations fops_enabled = { - .owner = THIS_MODULE, - .read = read_enabled_file_bool, - .write = write_enabled_file_bool -}; - static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; -static void enable_mmiotrace(void) +void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; - chan = relay_open("cpu", dir, 
subbuf_size, n_subbufs, - &relay_callbacks, NULL); - if (!chan) { - pr_err(NAME "relay app channel creation failed.\n"); - goto out; - } - reference_kmmio(); +#if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); +#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); @@ -598,7 +480,7 @@ out: mutex_unlock(&mmiotrace_mutex); } -static void disable_mmiotrace(void) +void disable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (!is_enabled()) @@ -615,17 +497,13 @@ static void disable_mmiotrace(void) debugfs_remove(marker_file); marker_file = NULL; } - if (chan) { - relay_close(chan); - chan = NULL; - } pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } -static int __init init(void) +int __init init_mmiotrace(void) { pr_debug(NAME "load...\n"); if (n_subbufs < 2) @@ -636,31 +514,5 @@ static int __init init(void) pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - - enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, - &fops_enabled); - if (!enabled_file) { - pr_err(NAME "Couldn't create enabled file.\n"); - debugfs_remove(dir); - return -ENOMEM; - } - - if (enable_now) - enable_mmiotrace(); - return 0; } - -static void __exit cleanup(void) -{ - pr_debug(NAME "unload...\n"); - if (enabled_file) - debugfs_remove(enabled_file); - disable_mmiotrace(); - if (dir) - debugfs_remove(dir); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index cb5efd0c7f5..579b3b06c90 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,6 +54,12 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ +/* in kernel/trace/trace_mmiotrace.c */ +extern int __init init_mmiotrace(void); +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void 
mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); + #endif /* __KERNEL__ */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbfa2bd..c44a7dce908 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 00000000000..e4dd03cc5aa --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,84 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include + +#include "trace.h" + +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +static struct trace_array *mmio_trace_array; + + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) + enable_mmiotrace(); +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + enable_mmiotrace(); + else + disable_mmiotrace(); +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .ctrl_update = mmio_trace_ctrl_update, +}; + +__init static int init_mmio_trace(void) +{ + int ret = init_mmiotrace(); + if (ret) + return ret; + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +{ + struct trace_array *tr = mmio_trace_array; + struct 
trace_array_cpu *data = tr->data[smp_processor_id()]; + + if (!current || current->pid == 0) { + /* + * XXX: This is a problem. We need to able to record, no + * matter what. tracing_generic_entry_update() would crash. + */ + static unsigned limit; + if (limit++ < 12) + pr_err("Error in %s: no current.\n", __func__); + return; + } + if (!tr || !data) { + static unsigned limit; + if (limit++ < 12) + pr_err("%s: no tr or data\n", __func__); + return; + } + __trace_special(tr, data, type, addr, arg); +} -- cgit v1.2.3-70-g09d2 From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: mmiotrace, updates here is a patch that makes mmiotrace work almost well within the tracing framework. The patch applies on top of my previous patch. I have my own output formatting in place now. Summary of changes: - fix the NULL dereference that was due to not calling tracing_reset() - add print_line() callback into struct tracer - implement print_line() for mmiotrace, producing up-to-spec text - add my output header, but that is not really called in the right place - rewrote the main structs in mmiotrace - added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP - made some functions in trace.c non-static - check current==NULL in tracing_generic_entry_update() - fix(?) comparison in trace_seq_printf() Things seem to work fine except a few issues. Markers (text lines injected into mmiotrace log) are missing, I did not feel hacking them in before we have variable length entries. My output header is printed only for 'trace' file, but not 'trace_pipe'. For some reason, despite my quick fix, iter->trace is NULL in print_trace_line() when called from 'trace_pipe' file, which means I don't get proper output formatting. I only tried by loading nouveau.ko, which just detects the card, and that is traced fine. I didn't try further. Map, two reads and unmap. Works perfectly. 
I am missing the information about overflows, I'd prefer to have a counter for lost events. I didn't try, but I guess currently there is no way of knowning when it overflows? So, not too far from being fully operational, it seems :-) And looking at the diffstat, there also is some 700-900 lines of user space code that just became obsolete. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/mmiotrace/mmio-mod.c | 140 ++++++++++---------------------- include/linux/mmiotrace.h | 85 ++++++-------------- kernel/trace/trace.c | 34 ++++++++ kernel/trace/trace.h | 14 ++++ kernel/trace/trace_mmiotrace.c | 151 ++++++++++++++++++++++++++++------- 6 files changed, 238 insertions(+), 188 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7e4b8494078..1d6de0d67f9 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY + depends on DEBUG_KERNEL select TRACING select MMIOTRACE_HOOKS default y diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index c7a67d7e482..62abc281a51 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -37,11 +37,6 @@ #define NAME "mmiotrace: " -/* This app's relay channel files will appear in /debug/mmio-trace */ -static const char APP_DIR[] = "mmio-trace"; -/* the marker injection file in /debug/APP_DIR */ -static const char MARKER_FILE[] = "mmio-marker"; - struct trap_reason { unsigned long addr; unsigned long ip; @@ -56,18 +51,15 @@ struct remap_trace { unsigned long id; }; -static const size_t subbuf_size = 256*1024; - /* Accessed per-cpu. 
*/ static DEFINE_PER_CPU(struct trap_reason, pf_reason); -static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); #if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); #endif -static struct dentry *dir; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); @@ -82,24 +74,21 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that chan is valid. + * - is_enabled() guarantees that mmio_trace_record is allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ /* module parameters */ -static unsigned int n_subbufs = 32*4; static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); @@ -110,6 +99,7 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } +#if 0 /* XXX: needs rewrite */ /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -145,6 +135,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, kfree(event); return len; } +#endif static void print_pte(unsigned long address) { @@ -198,9 +189,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = 
&get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -212,23 +204,17 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_reason->addr = addr; my_reason->ip = instptr; - my_trace->header.type = MMIO_MAGIC; - my_trace->header.pid = 0; - my_trace->header.data_len = sizeof(struct mm_io_rw); - my_trace->rw.address = addr; - /* - * struct remap_trace *trace = p->user_data; - * phys = addr - trace->probe.addr + trace->phys; - */ + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. */ if (trace_pc) - my_trace->rw.pc = instptr; + my_trace->pc = instptr; else - my_trace->rw.pc = 0; + my_trace->pc = 0; /* * XXX: the timestamp recorded will be *after* the tracing has been @@ -238,28 +224,25 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, switch (type) { case REG_READ: - my_trace->header.type |= - (MMIO_READ << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); break; case REG_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_reg_val(instptr, regs); + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); break; case IMM_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_imm_val(instptr); + my_trace->opcode = MMIO_WRITE; + my_trace->width = 
get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); break; default: { unsigned char *ip = (unsigned char *)instptr; - my_trace->header.type |= - (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); - my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 | + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | *(ip + 2); } } @@ -271,7 +254,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct pt_regs *regs) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); /* this should always return the active_trace count to 0 */ my_reason->active_traces--; @@ -282,20 +265,13 @@ static void post(struct kmmio_probe *p, unsigned long condition, switch (my_reason->type) { case REG_READ: - my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs); + my_trace->value = get_ins_reg_val(my_reason->ip, regs); break; default: break; } - /* - * XXX: Several required values are ignored: - * - mapping id - * - program counter - * Also the address should be physical, not virtual. 
- */ - mmio_trace_record(my_trace->header.type, my_trace->rw.address, - my_trace->rw.value); + mmio_trace_rw(my_trace); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } @@ -305,21 +281,11 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, { static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_PROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = offset, - .addr = (unsigned long)addr, - .len = size, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE }; if (!trace) { @@ -338,15 +304,13 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .phys = offset, .id = atomic_inc_return(&next_id) }; + map.map_id = trace->id; spin_lock_irq(&trace_lock); if (!is_enabled()) goto not_enabled; - /* - * XXX: Insufficient data recorded! 
- */ - mmio_trace_record(event.header.type, event.map.addr, event.map.len); + mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -369,21 +333,11 @@ mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) static void iounmap_trace_core(volatile void __iomem *addr) { - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = 0, - .addr = (unsigned long)addr, - .len = 0, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE }; struct remap_trace *trace; struct remap_trace *tmp; @@ -404,8 +358,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - mmio_trace_record(event.header.type, event.map.addr, - found_trace ? found_trace->id : -1); + map.map_id = (found_trace) ? 
found_trace->id : -1; + mmio_trace_mapping(&map); not_enabled: spin_unlock_irq(&trace_lock); @@ -448,10 +402,12 @@ static void clear_trace_list(void) } } +#if 0 /* XXX: out of order */ static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; +#endif void enable_mmiotrace(void) { @@ -464,9 +420,9 @@ void enable_mmiotrace(void) #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); -#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); +#endif if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); @@ -502,17 +458,3 @@ void disable_mmiotrace(void) out: mutex_unlock(&mmiotrace_mutex); } - -int __init init_mmiotrace(void) -{ - pr_debug(NAME "load...\n"); - if (n_subbufs < 2) - return -EINVAL; - - dir = debugfs_create_dir(APP_DIR, NULL); - if (!dir) { - pr_err(NAME "Couldn't create relay app directory.\n"); - return -ENOMEM; - } - return 0; -} diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 579b3b06c90..c88a9c197d2 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,73 +54,38 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ -/* in kernel/trace/trace_mmiotrace.c */ -extern int __init init_mmiotrace(void); -extern void enable_mmiotrace(void); -extern void disable_mmiotrace(void); -extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); - -#endif /* __KERNEL__ */ - - -/* - * If you change anything here, you must bump MMIO_VERSION. - * This is the relay data format for user space. 
- */ -#define MMIO_VERSION 0x04 - -/* mm_io_header.type */ -#define MMIO_OPCODE_MASK 0xff -#define MMIO_OPCODE_SHIFT 0 -#define MMIO_WIDTH_MASK 0xff00 -#define MMIO_WIDTH_SHIFT 8 -#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) -#define MMIO_MAGIC_MASK 0xffff0000 - -enum mm_io_opcode { /* payload type: */ - MMIO_READ = 0x1, /* struct mm_io_rw */ - MMIO_WRITE = 0x2, /* struct mm_io_rw */ - MMIO_PROBE = 0x3, /* struct mm_io_map */ - MMIO_UNPROBE = 0x4, /* struct mm_io_map */ +enum mm_io_opcode { + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ + MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ }; -struct mm_io_header { - __u32 type; /* see MMIO_* macros above */ - __u32 sec; /* timestamp */ - __u32 nsec; - __u32 pid; /* PID of the process, or 0 for kernel core */ - __u16 data_len; /* length of the following payload */ +struct mmiotrace_rw { + unsigned long phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; -struct mm_io_rw { - __u64 address; /* virtual address of register */ - __u64 value; - __u64 pc; /* optional program counter */ +struct mmiotrace_map { + unsigned long phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; -struct mm_io_map { - __u64 phys; /* base address in PCI space */ - __u64 addr; /* base virtual address */ - __u64 len; /* mapping size */ - __u64 pc; /* optional program counter */ -}; - - -/* - * These structures are used to allow a single relay_write() - * call to write a 
full packet. - */ - -struct mm_io_header_rw { - struct mm_io_header header; - struct mm_io_rw rw; -} __attribute__((packed)); +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); -struct mm_io_header_map { - struct mm_io_header header; - struct mm_io_map map; -} __attribute__((packed)); +#endif /* __KERNEL__ */ #endif /* MMIOTRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff03..d14fe49e963 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94e..0ef9ef74c80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include enum 
trace_type { __TRACE_FIRST_TYPE = 0, @@ -14,6 +15,8 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, __TRACE_LAST_TYPE }; @@ -75,6 +78,8 @@ struct trace_entry { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; }; }; @@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e4dd03cc5aa..3a12b1ad0c6 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,19 +11,26 @@ #include "trace.h" -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - static struct trace_array *mmio_trace_array; +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); + } } static void mmio_trace_reset(struct trace_array *tr) @@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr) pr_debug("in %s\n", __func__); if (tr->ctrl) disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; } static void mmio_trace_ctrl_update(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) + if (tr->ctrl) { + 
mmio_reset_data(tr); enable_mmiotrace(); - else + } else { disable_mmiotrace(); + } +} + +/* XXX: This is not called for trace_pipe file! */ +void mmio_print_header(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + trace_seq_printf(s, "VERSION broken 20070824\n"); + /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, entry->pid); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + 0UL, entry->pid); + 
break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, entry->pid); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } } static struct tracer mmio_tracer __read_mostly = @@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .open = mmio_print_header, .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, }; __init static int init_mmio_trace(void) { - int ret = init_mmiotrace(); - if (ret) - return ret; return register_tracer(&mmio_tracer); } device_initcall(init_mmio_trace); -void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} - if (!current || current->pid == 0) { - /* - * XXX: This is a problem. We need to able to record, no - * matter what. tracing_generic_entry_update() would crash. 
- */ - static unsigned limit; - if (limit++ < 12) - pr_err("Error in %s: no current.\n", __func__); - return; - } - if (!tr || !data) { - static unsigned limit; - if (limit++ < 12) - pr_err("%s: no tr or data\n", __func__); - return; - } - __trace_special(tr, data, type, addr, arg); +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); } -- cgit v1.2.3-70-g09d2 From 138295373ccf7625fcb0218dfea114837983bc39 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:58 +0200 Subject: ftrace: mmiotrace update, #2 another weekend, another patch. This should apply on top of my previous patch from March 23rd. Summary of changes: - Print PCI device list in output header - work around recursive probe hits on SMP - refactor dis/arm_kmmio_fault_page() and add check for page levels - remove un/reference_kmmio(), the die notifier hook is registered permanently into the list - explicitly check for single stepping in die notifier callback I have tested this version on my UP Athlon64 desktop with Nouveau, and SMP Core 2 Duo laptop with the proprietary nvidia driver. Both systems are 64-bit. One previously unknown bug crept into daylight: the ftrace framework's output routines print the first entry last after buffer has wrapped around. The most important regressions compared to non-ftrace mmiotrace at this time are: - failure of trace_pipe file - illegal lines in output file - unaware of losing data due to buffer full Personally I'd like to see these three solved before submitting to mainline. Other issues may come up once we know when we lose events. 
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mmiotrace/kmmio.c | 186 ++++++++++++++--------------------- arch/x86/kernel/mmiotrace/mmio-mod.c | 3 - include/linux/mmiotrace.h | 2 - kernel/trace/trace_mmiotrace.c | 47 ++++++++- 4 files changed, 120 insertions(+), 118 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index efb46793308..cd0d95fe4fe 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -5,15 +5,12 @@ * 2008 Pekka Paalanen */ -#include #include #include #include #include #include -#include #include -#include #include #include #include @@ -22,10 +19,9 @@ #include #include #include -#include #include -#include - +#include +#include #include #define KMMIO_PAGE_HASH_BITS 4 @@ -57,14 +53,9 @@ struct kmmio_context { int active; }; -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args); - -static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); -/* These are protected by kmmio_lock */ -static int kmmio_initialized; +/* Protected by kmmio_lock */ unsigned int kmmio_count; /* Read-protected by RCU, write-protected by kmmio_lock. */ @@ -79,60 +70,6 @@ static struct list_head *kmmio_page_list(unsigned long page) /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); -/* protected by kmmio_init_mutex */ -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -/** - * Makes sure kmmio is initialized and usable. - * This must be called before any other kmmio function defined here. - * May sleep. 
- */ -void reference_kmmio(void) -{ - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - if (!kmmio_initialized) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - if (register_die_notifier(&nb_die)) - BUG(); - } - kmmio_initialized++; - spin_unlock_irq(&kmmio_lock); - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL_GPL(reference_kmmio); - -/** - * Clean up kmmio after use. This must be called for every call to - * reference_kmmio(). All probes registered after the corresponding - * reference_kmmio() must have been unregistered when calling this. - * May sleep. - */ -void unreference_kmmio(void) -{ - bool unreg = false; - - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - - if (kmmio_initialized == 1) { - BUG_ON(is_kmmio_active()); - unreg = true; - } - kmmio_initialized--; - BUG_ON(kmmio_initialized < 0); - spin_unlock_irq(&kmmio_lock); - - if (unreg) - unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL(unreference_kmmio); - /* * this is basically a dynamic stabbing problem: * Could use the existing prio tree code or @@ -167,58 +104,56 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -/** Mark the given page as not present. Access to it will trigger a fault. 
*/ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void set_page_present(unsigned long addr, bool present, int *pglevel) { - unsigned long address = page & PAGE_MASK; + pteval_t pteval; + pmdval_t pmdval; int level; - pte_t *pte = lookup_address(address, &level); + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); + pr_err("kmmio: no pte for page 0x%08lx\n", addr); return; } - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; } - if (page_level) - *page_level = level; + __flush_tlb_one(addr); +} - __flush_tlb_one(page); +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); } /** Mark the given page as present. 
*/ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { - unsigned long address = page & PAGE_MASK; - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); - return; - } - - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - } - - if (page_level) - *page_level = level; - - __flush_tlb_one(page); + set_page_present(page & PAGE_MASK, true, page_level); } /* @@ -240,6 +175,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ /* * Preemption is now disabled to prevent process switch during @@ -257,21 +193,35 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* * Either this page fault is not caused by kmmio, or * another CPU just pulled the kmmio probe from under - * our feet. In the latter case all hell breaks loose. + * our feet. The latter case should not be possible. */ goto no_kmmio; } ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } /* * Prevent overwriting already in-flight context. - * If this page fault really was due to kmmio trap, - * all hell breaks loose. + * This should not happen, let's hope disarming at least + * prevents a panic. */ pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. 
Ignoring.\n", smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -302,14 +252,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ put_cpu_var(kmmio_ctx); - return 1; + return 1; /* fault handled */ no_kmmio_ctx: put_cpu_var(kmmio_ctx); no_kmmio: rcu_read_unlock(); preempt_enable_no_resched(); - return 0; /* page fault not handled by kmmio */ + return ret; } /* @@ -322,8 +272,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) int ret = 0; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - if (!ctx->active) + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); goto out; + } if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -525,9 +478,22 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, { struct die_args *arg = args; - if (val == DIE_DEBUG) + if (val == DIE_DEBUG && (arg->err & DR_STEP)) if (post_kmmio_handler(arg->err, arg->regs) == 1) return NOTIFY_STOP; return NOTIFY_DONE; } + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 62abc281a51..8256546d49b 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -415,8 +415,6 @@ void enable_mmiotrace(void) if (is_enabled()) goto out; - reference_kmmio(); - #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); @@ -448,7 +446,6 @@ void disable_mmiotrace(void) 
spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ - unreference_kmmio(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index c88a9c197d2..dd6b64b160f 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -31,8 +31,6 @@ static inline int is_kmmio_active(void) return kmmio_count; } -extern void reference_kmmio(void); -extern void unreference_kmmio(void); extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3a12b1ad0c6..361472b5788 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" @@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr) } } +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? + */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? 
+ (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + /* XXX: This is not called for trace_pipe file! */ -void mmio_print_header(struct trace_iterator *iter) +static void mmio_print_header(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION broken 20070824\n"); - /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ + struct pci_dev *dev = NULL; + + trace_seq_printf(s, "VERSION 20070824\n"); + + for_each_pci_dev(dev) + mmio_print_pcidev(s, dev); + /* XXX: return value? What if header is very long? */ } static int mmio_print_rw(struct trace_iterator *iter) -- cgit v1.2.3-70-g09d2 From 970e6fa03885f32cc43e42cb08c73a5f54cd8bd9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: code style cleanups From c2da03771e29159627c5c7b9509ec70bce9f91ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 28 Apr 2008 21:25:22 +0300 Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 4 ++-- arch/x86/mm/mmio-mod.c | 7 +++---- arch/x86/mm/testmmiotrace.c | 2 +- include/linux/mmiotrace.h | 6 +----- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 3ad27b8504a..6a92d9111b6 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -17,10 +17,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 1f77d853203..a8d2a0019da 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -418,11 +418,10 @@ static void enter_uniprocessor(void) for_each_cpu_mask(cpu, downed_cpus) { err = cpu_down(cpu); - if (!err) { + if (!err) 
pr_info(NAME "CPU%d is down.\n", cpu); - } else { + else pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); - } } if (num_online_cpus() > 1) pr_warning(NAME "multiple CPUs still online, " diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index cfa60b227c8..d877c5b423e 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -2,7 +2,7 @@ * Written by Pekka Paalanen, 2008 */ #include -#include +#include #define MODULE_NAME "testmmiotrace" diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index dd6b64b160f..de8e91258da 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,9 +1,7 @@ #ifndef MMIOTRACE_H #define MMIOTRACE_H -#include - -#ifdef __KERNEL__ +#include #include @@ -84,6 +82,4 @@ extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); -#endif /* __KERNEL__ */ - #endif /* MMIOTRACE_H */ -- cgit v1.2.3-70-g09d2 From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: use resource_size_t for phys addresses Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 11 ++++++----- include/linux/mmiotrace.h | 14 +++++++------- kernel/trace/trace_mmiotrace.c | 20 ++++++++++++-------- 3 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 278998c1998..3b04a012612 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -48,7 +48,7 @@ struct trap_reason { struct remap_trace { struct list_head list; struct kmmio_probe probe; - unsigned long phys; + resource_size_t phys; unsigned long id; }; @@ -275,7 +275,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, put_cpu_var(pf_reason); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, 
+static void ioremap_trace_core(resource_size_t offset, unsigned long size, void __iomem *addr) { static atomic_t next_id; @@ -319,13 +319,14 @@ not_enabled: spin_unlock_irq(&trace_lock); } -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) { if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index de8e91258da..5cbbc374e94 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -2,7 +2,6 @@ #define MMIOTRACE_H #include - #include struct kmmio_probe; @@ -37,14 +36,15 @@ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); /* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE -extern void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); #else -static inline void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) { } + static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } @@ -60,7 +60,7 @@ enum mm_io_opcode { }; struct mmiotrace_rw { - unsigned long phys; /* PCI address of register */ + resource_size_t phys; /* PCI address of register */ unsigned long value; unsigned long pc; /* optional program counter */ int map_id; @@ -69,7 +69,7 @@ struct mmiotrace_rw { }; struct mmiotrace_map { - unsigned long phys; /* base address in PCI space */ 
+ resource_size_t phys; /* base address in PCI space */ unsigned long virt; /* base virtual address */ unsigned long len; /* mapping size */ int map_id; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3c1dacdc2d8..b13dc19dcbb 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, rw->phys, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, (rw->value >> 0) & 0xff, rw->pc, 0); break; @@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: -- cgit v1.2.3-70-g09d2 From a50445d76c22a34ae149704ea5adaef171c8acb7 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: rename 
kmmio_probe::user_data to :private. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 4 ++-- include/linux/mmiotrace.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3b04a012612..ed0e0e90b3e 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -191,7 +191,7 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; + struct remap_trace *trace = p->private; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -299,7 +299,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, .len = size, .pre_handler = pre, .post_handler = post, - .user_data = trace + .private = trace }, .phys = offset, .id = atomic_inc_return(&next_id) diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 5cbbc374e94..61d19e1b7a0 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -18,7 +18,7 @@ struct kmmio_probe { unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *user_data; + void *private; }; /* kmmio is active by some kmmio_probes? 
*/ -- cgit v1.2.3-70-g09d2 From 42fdfa238a23643226910acf922ea930b3286032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:14:51 +0200 Subject: namespacecheck: more kernel/printk.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/kernel.h | 6 ------ kernel/printk.c | 13 ------------- 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 792bf0aa779..f2a668c195b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -184,9 +184,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))) __cold; -extern int log_buf_get_len(void); -extern int log_buf_read(int idx); -extern int log_buf_copy(char *dest, int idx, int len); extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; @@ -202,9 +199,6 @@ static inline int vprintk(const char *s, va_list args) { return 0; } static inline int printk(const char *s, ...) __attribute__ ((format (printf, 1, 2))); static inline int __cold printk(const char *s, ...) { return 0; } -static inline int log_buf_get_len(void) { return 0; } -static inline int log_buf_read(int idx) { return 0; } -static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } static inline int printk_ratelimit(void) { return 0; } static inline int __printk_ratelimit(int ratelimit_jiffies, \ int ratelimit_burst) { return 0; } diff --git a/kernel/printk.c b/kernel/printk.c index b620e3d9613..55d16e57499 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -267,19 +267,6 @@ int log_buf_copy(char *dest, int idx, int len) return ret; } -/* - * Extract a single character from the log buffer. 
- */ -static int log_buf_read(int idx) -{ - char ret; - - if (log_buf_copy(&ret, idx, 1) == 1) - return ret; - else - return -1; -} - /* * Commands to do_syslog: * -- cgit v1.2.3-70-g09d2 From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: move tracedata to RODATA .. allowing it to be write-protected just as other read-only data under CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmlinux_32.lds.S | 7 ------- arch/x86/kernel/vmlinux_64.lds.S | 7 ------- drivers/base/power/trace.c | 2 +- include/asm-generic/vmlinux.lds.h | 14 ++++++++++++++ include/asm-x86/resume-trace.h | 2 +- include/linux/resume-trace.h | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e..2674f579627 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -60,13 +60,6 @@ SECTIONS BUG_TABLE :text - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - RODATA /* writeable */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a..687041bfbae 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -53,13 +53,6 @@ SECTIONS RODATA - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - . 
= ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc..87a7f1d0257 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(void *tracedata, unsigned int user) +void generate_resume_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916..f1992dc5c42 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -93,6 +93,8 @@ VMLINUX_SYMBOL(__end_rio_route_ops) = .; \ } \ \ + TRACEDATA \ + \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ @@ -318,6 +320,18 @@ __stop___bug_table = .; \ } +#ifdef CONFIG_PM_TRACE +#define TRACEDATA \ + . 
= ALIGN(4); \ + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { \ + __tracedata_start = .; \ + *(.tracedata) \ + __tracedata_end = .; \ + } +#else +#define TRACEDATA +#endif + #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_notes) = .; \ diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h index 2557514d7ef..8d9f0b41ee8 100644 --- a/include/asm-x86/resume-trace.h +++ b/include/asm-x86/resume-trace.h @@ -6,7 +6,7 @@ #define TRACE_RESUME(user) \ do { \ if (pm_trace_enabled) { \ - void *tracedata; \ + const void *tracedata; \ asm volatile(_ASM_MOV_UL " $1f,%0\n" \ ".section .tracedata,\"a\"\n" \ "1:\t.word %c1\n\t" \ diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index f3f4f28c696..c9ba2fdf807 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -8,7 +8,7 @@ extern int pm_trace_enabled; struct device; extern void set_trace_device(struct device *); -extern void generate_resume_trace(void *tracedata, unsigned int user); +extern void generate_resume_trace(const void *tracedata, unsigned int user); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ -- cgit v1.2.3-70-g09d2 From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001 From: "Carlos R. 
Mafra" Date: Thu, 15 May 2008 11:15:37 -0300 Subject: Remove argument from open_softirq which is always NULL As git-grep shows, open_softirq() is always called with the last argument being NULL block/blk-core.c: open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); kernel/hrtimer.c: open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); kernel/rcuclassic.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/rcupreempt.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); kernel/softirq.c: open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); kernel/softirq.c: open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); This observation has already been made by Matthew Wilcox in June 2002 (http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html) "I notice that none of the current softirq routines use the data element passed to them." and the situation hasn't changed since then. So it appears we can safely remove that extra argument to save 128 (54) bytes of kernel data (text). Signed-off-by: Carlos R.
Mafra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- block/blk-core.c | 2 +- include/linux/interrupt.h | 3 +-- kernel/hrtimer.c | 2 +- kernel/rcuclassic.c | 2 +- kernel/rcupreempt.c | 2 +- kernel/sched.c | 2 +- kernel/softirq.c | 7 +++---- kernel/timer.c | 2 +- net/core/dev.c | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d22a6..75fdc65136e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2048,7 +2048,7 @@ int __init blk_dev_init(void) for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); register_hotcpu_notifier(&blk_cpu_notifier); return 0; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f1fc7470d26..a86186dd047 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -285,12 +285,11 @@ enum struct softirq_action { void (*action)(struct softirq_action *); - void *data; }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); -extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); +extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc..861b4088092 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1669,7 +1669,7 @@ void __init hrtimers_init(void) (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); #ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); #endif } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..f6e01f3ae9c 100644 --- 
a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -529,7 +529,7 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } static int __cpuinit rcu_cpu_notify(struct notifier_block *self, diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a51..9dd827db359 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1125,7 +1125,7 @@ void __init __rcu_init(void) for_each_online_cpu(cpu) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } /* diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a9153..56ea3a203a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8154,7 +8154,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #endif #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e06174004..059256874e9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -347,9 +347,8 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) +void open_softirq(int nr, void (*action)(struct softirq_action *)) { - softirq_vec[nr].data = data; softirq_vec[nr].action = action; } @@ -503,8 +502,8 @@ void __init softirq_init(void) &per_cpu(tasklet_hi_vec, cpu).head; } - open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); - open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); + open_softirq(HI_SOFTIRQ, tasklet_hi_action); } static int ksoftirqd(void * __bind_cpu) diff --git a/kernel/timer.c b/kernel/timer.c index ceacc662657..b4da888497f 100644 --- 
a/kernel/timer.c +++ b/kernel/timer.c @@ -1502,7 +1502,7 @@ void __init init_timers(void) BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } /** diff --git a/net/core/dev.c b/net/core/dev.c index 58296307787..cf0e16731dc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4563,8 +4563,8 @@ static int __init net_dev_init(void) dev_boot_phase = 0; - open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); - open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); -- cgit v1.2.3-70-g09d2 From e9197bf0114661195bee35e7795cfc42164d9b2c Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:10 -0700 Subject: x86 boot: remove some unused extern function declarations Remove three extern declarations for routines that don't exist. Fix a typo in a comment. 
Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 2 +- include/linux/efi.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5..afb07ffb931 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, else bootmap_start = round_up(start, PAGE_SIZE); /* - * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like + * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE */ bootmap = early_node_mem(nodeid, bootmap_start, end, diff --git a/include/linux/efi.h b/include/linux/efi.h index a5f359a7ad0..807373d467f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -287,7 +287,6 @@ efi_guid_unparse(efi_guid_t *guid, char *out) extern void efi_init (void); extern void *efi_get_pal_addr (void); extern void efi_map_pal_code (void); -extern void efi_map_memmap(void); extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg); extern void efi_gettimeofday (struct timespec *ts); extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */ @@ -295,14 +294,11 @@ extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); -extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, - u64 attr); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern unsigned long efi_get_time(void); extern int efi_set_rtc_mmss(unsigned long nowtime); -extern int is_available_memory(efi_memory_desc_t * md); extern struct efi_memory_map memmap; /** -- cgit 
v1.2.3-70-g09d2 From c801ed3860fe2f84281d4cae4c3e6ca87e81e890 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:23 -0700 Subject: x86 boot: simplify pageblock_bits enum declaration The use of #defines with '##' pre-processor concatenation is a useful way to form several symbol names with a common pattern. But when there is just a single name obtained from that #define, it's just obfuscation. Better to just write the plain symbol name, as is. The following patch is a result of my wasting ten minutes looking through the kernel to figure out what 'PB_migrate_end' meant, and forgetting what I came to do, by the time I figured out that the #define PB_range macro defined it. Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- include/linux/pageblock-flags.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e875905f7b1..e8c06122be3 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -25,13 +25,11 @@ #include -/* Macro to aid the definition of ranges of bits */ -#define PB_range(name, required_bits) \ - name, name ## _end = (name + required_bits) - 1 - /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_range(PB_migrate, 3), /* 3 bits required for migrate types */ + PB_migrate, + PB_migrate_end = PB_migrate + 3 - 1, + /* 3 bits required for migrate types */ NR_PAGEBLOCK_BITS }; -- cgit v1.2.3-70-g09d2 From 41c52c0db9607e59f90da7da5309489fa06e887f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 11:46:33 -0400 Subject: ftrace: set_ftrace_notrace feature While debugging latencies in the RT kernel, I found that it would be nice to be able to filter away functions from the trace than just to filter on functions. 
I added a new interface to the debugfs tracing directory called set_ftrace_notrace When dynamic ftrace is enabled, this lets you filter away functions that will not be recorded in the trace. It is similar to adding 'notrace' to those functions but by doing it without recompiling the kernel. Here's how set_ftrace_filter and set_ftrace_notrace interact. Remember, if set_ftrace_filter is set, it removes all functions from the trace except for those listed in the set_ftrace_filter. set_ftrace_notrace will prevent those functions from being traced. If you were to set one function in both set_ftrace_filter and set_ftrace_notrace and that function was the same, then you would end up with an empty trace. the set of functions to trace is: set_ftrace_filter == empty then all functions not in set_ftrace_notrace else set of the set_ftrace_filter and not in set of set_ftrace_notrace. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 170 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 131 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 922e23d0196..ffbbd54a720 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -48,6 +48,7 @@ enum { FTRACE_FL_FAILED = (1 << 1), FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), + FTRACE_FL_NOTRACE = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89bd9a6f52e..2552454609c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -170,7 +170,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); -static DEFINE_MUTEX(ftrace_filter_lock); +static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; @@ -337,13 +337,12 @@ static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char
*old, unsigned char *new, int enable) { - unsigned long ip; + unsigned long ip, fl; int failed; ip = rec->ip; if (ftrace_filtered && enable) { - unsigned long fl; /* * If filtering is on: * @@ -356,13 +355,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is not set to be filtered * and it is not enabled do nothing. * + * If this record is set not to trace then + * do nothing. + * * If this record is not set to be filtered and * it is enabled, disable it. */ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0)) + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) return; /* @@ -380,9 +382,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } else { - if (enable) + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + new = ftrace_call_replace(ip, FTRACE_ADDR); - else + } else old = ftrace_call_replace(ip, FTRACE_ADDR); if (enable) { @@ -721,6 +731,7 @@ static int __init ftrace_dyn_table_alloc(void) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -754,7 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FAILED) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER))) { + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; goto retry; } @@ -847,22 +860,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void ftrace_filter_reset(void) +static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long type = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 0; + if (enable) + ftrace_filtered = 0; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; if (rec->flags & FTRACE_FL_FAILED) continue; - rec->flags &= ~FTRACE_FL_FILTER; + rec->flags &= ~type; } pg = pg->next; } @@ -870,7 +885,7 @@ static void ftrace_filter_reset(void) } static int -ftrace_filter_open(struct inode *inode, struct file *file) +ftrace_regex_open(struct inode *inode, struct file *file, int enable) { struct ftrace_iterator *iter; int ret = 0; @@ -882,15 +897,16 @@ ftrace_filter_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND)) - ftrace_filter_reset(); + ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; iter->pos = -1; - iter->flags = FTRACE_ITER_FILTER; + iter->flags = enable ? 
FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -900,13 +916,25 @@ ftrace_filter_open(struct inode *inode, struct file *file) kfree(iter); } else file->private_data = iter; - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + static ssize_t -ftrace_filter_read(struct file *file, char __user *ubuf, +ftrace_regex_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { if (file->f_mode & FMODE_READ) @@ -916,7 +944,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, } static loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -936,13 +964,14 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len) +ftrace_match(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; + unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; for (i = 0; i < len; i++) { @@ -966,7 +995,8 @@ ftrace_match(unsigned char *buff, int len) /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 1; + if (enable) + ftrace_filtered = 1; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { @@ -997,7 +1027,7 @@ ftrace_match(unsigned char *buff, int len) break; } if (matched) - rec->flags |= FTRACE_FL_FILTER; + rec->flags |= flag; } pg = pg->next; } @@ -1005,8 +1035,8 @@ ftrace_match(unsigned char *buff, int len) } static ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; char ch; @@ -1016,7 +1046,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -1045,7 +1075,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, cnt--; } - if (isspace(ch)) { file->f_pos += read; ret = read; @@ -1072,7 +1101,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1082,11 +1111,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, ret = read; out: - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const 
char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @buf - the string that holds the function filter text. @@ -1098,24 +1155,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - if (unlikely(ftrace_disabled)) - return; + ftrace_set_regex(buf, len, reset, 1); +} - mutex_lock(&ftrace_filter_lock); - if (reset) - ftrace_filter_reset(); - if (buf) - ftrace_match(buf, len); - mutex_unlock(&ftrace_filter_lock); +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. 
+ */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); } static int -ftrace_filter_release(struct inode *inode, struct file *file) +ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; @@ -1126,7 +1190,7 @@ ftrace_filter_release(struct inode *inode, struct file *file) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); @@ -1137,10 +1201,22 @@ ftrace_filter_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_sysctl_lock); kfree(iter); - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return 0; } +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1150,12 +1226,20 @@ static struct file_operations ftrace_avail_fops = { static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_filter_read, + .read = ftrace_regex_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + /** * ftrace_force_update - force an update to all recording ftrace 
functions * @@ -1239,6 +1323,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); return 0; } -- cgit v1.2.3-70-g09d2 From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 26 May 2008 23:31:07 +0100 Subject: xen: Enable console tty by default in domU if it's not a dummy Without console= arguments on the kernel command line, the first console to register becomes enabled and the preferred console (the one behind /dev/console). This is normally tty (assuming CONFIG_VT_CONSOLE is enabled, which it commonly is). This is okay as long as tty is a useful console. But unless we have the PV framebuffer, and it is enabled for this domain, tty0 in domU is merely a dummy. In that case, we want the preferred console to be the Xen console hvc0, and we want it without having to fiddle with the kernel command line. Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7 did that for us. Since we now have the PV framebuffer, we want to enable and prefer tty again, but only when PVFB is enabled. But even then we still want to enable the Xen console as well. Problem: when tty registers, we can't yet know whether the PVFB is enabled. By the time we can know (xenstore is up), the console setup game is over. Solution: enable console tty by default, but keep hvc as the preferred console. Change the preferred console to tty when PVFB probes successfully, unless we've been given console kernel parameters.
Signed-off-by: Markus Armbruster Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 4 +++- drivers/video/xen-fbfront.c | 25 +++++++++++++++++++++++++ include/linux/console.h | 2 ++ kernel/printk.c | 3 +++ 4 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1b4b5fa498b..6cfb708408e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1256,8 +1256,10 @@ asmlinkage void __init xen_start_kernel(void) ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - if (!is_initial_xendomain()) + if (!is_initial_xendomain()) { + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } /* Start the world */ start_kernel(); diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 619a6f8d65a..4e10876e62f 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -18,6 +18,7 @@ * frame buffer. 
*/ +#include #include #include #include @@ -48,6 +49,7 @@ struct xenfb_info { static u32 xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8; +static void xenfb_make_preferred_console(void); static int xenfb_remove(struct xenbus_device *); static void xenfb_init_shared_page(struct xenfb_info *); static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *); @@ -348,6 +350,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, if (ret < 0) goto error; + xenfb_make_preferred_console(); return 0; error_nomem: @@ -358,6 +361,28 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, return ret; } +static __devinit void +xenfb_make_preferred_console(void) +{ + struct console *c; + + if (console_set_on_cmdline) + return; + + acquire_console_sem(); + for (c = console_drivers; c; c = c->next) { + if (!strcmp(c->name, "tty") && c->index == 0) + break; + } + release_console_sem(); + if (c) { + unregister_console(c); + c->flags |= CON_CONSDEV; + c->flags &= ~CON_PRINTBUFFER; /* don't print again */ + register_console(c); + } +} + static int xenfb_resume(struct xenbus_device *dev) { struct xenfb_info *info = dev->dev.driver_data; diff --git a/include/linux/console.h b/include/linux/console.h index a4f27fbdf54..248e6e3b9b7 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -108,6 +108,8 @@ struct console { struct console *next; }; +extern int console_set_on_cmdline; + extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); extern void register_console(struct console *); diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3..028ed75d486 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -121,6 +121,8 @@ struct console_cmdline static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int selected_console = -1; static int preferred_console = -1; +int console_set_on_cmdline; 
+EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -890,6 +892,7 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); -- cgit v1.2.3-70-g09d2 From 0e91398f2a5d4eb6b07df8115917d0d1cf3e9b58 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:27 +0100 Subject: xen: implement save/restore This patch implements Xen save/restore and migration. Saving is triggered via xenbus, which is polled in drivers/xen/manage.c. When a suspend request comes in, the kernel prepares itself for saving by: 1 - Freeze all processes. This is primarily to prevent any partially-completed pagetable updates from confusing the suspend process. If CONFIG_PREEMPT isn't defined, then this isn't necessary. 2 - Suspend xenbus and other devices 3 - Stop_machine, to make sure all the other vcpus are quiescent. The Xen tools require the domain to run its save off vcpu0. 4 - Within the stop_machine state, it pins any unpinned pgds (under construction or destruction), canonicalizes various other pieces of state (mostly converting mfns to pfns), and finally 5 - Suspend the domain Restore reverses the steps used to save the domain, ending when all the frozen processes are thawed.
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 6 +-- arch/x86/xen/mmu.c | 46 +++++++++++++++++ arch/x86/xen/smp.c | 2 +- arch/x86/xen/suspend.c | 42 +++++++++++++++ arch/x86/xen/time.c | 8 +++ arch/x86/xen/xen-ops.h | 4 ++ drivers/xen/events.c | 83 +++++++++++++++++++++++++++++ drivers/xen/grant-table.c | 4 +- drivers/xen/manage.c | 126 ++++++++++++++++++++++++++++++++++++++++----- include/linux/page-flags.h | 1 + include/xen/events.h | 3 ++ include/xen/grant_table.h | 3 ++ include/xen/xen-ops.h | 9 ++++ 14 files changed, 318 insertions(+), 21 deletions(-) create mode 100644 arch/x86/xen/suspend.c (limited to 'include/linux') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b119b6b10..2ba2d164913 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ce67dc8f36a..b94f63ac228 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void setup_shared_info(void) +void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; - setup_shared_info(); + xen_setup_shared_info(); /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. 
*/ @@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) } /* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) +void xen_setup_vcpu_info_placement(void) { int cpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4740cda3656..e95955968ba 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + */ +void xen_mm_pin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + xen_pgd_pin((pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ @@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. 
+ */ +void xen_mm_unpin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + printk("unpinning pinned %p\n", page_address(page)); + xen_pgd_unpin((pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 74ab8968c52..d2e3c20127d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,7 +35,7 @@ #include "xen-ops.h" #include "mmu.h" -static cpumask_t xen_cpu_initialized_map; +cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq) = -1; static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c new file mode 100644 index 00000000000..7620a16fe53 --- /dev/null +++ b/arch/x86/xen/suspend.c @@ -0,0 +1,42 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" +#include "mmu.h" + +void xen_pre_suspend(void) +{ + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_post_suspend(int suspend_cancelled) +{ + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + xen_cpu_initialized_map = cpu_online_map; +#endif + } + + xen_setup_shared_info(); +} + diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c 
index c39e1a5aa24..0bef256e5f2 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_time_suspend(void) +{ +} + +void xen_time_resume(void) +{ +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a1bc89a8f16..a0503acad66 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -9,6 +9,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +struct trap_info; void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); +void xen_setup_shared_info(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); +extern cpumask_t xen_cpu_initialized_map; + /* Declare an asm function, along with symbols needed to make it inlineable */ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 70375a69076..73d78dc9b87 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -674,6 +674,89 @@ static int retrigger_dynirq(unsigned int irq) return ret; } +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_VIRQ); + BUG_ON(irq_info[irq].index != virq); + + /* Get a new binding from Xen. 
*/ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_IPI); + BUG_ON(irq_info[irq].index != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + + } +} + +void xen_irq_resume(void) +{ + unsigned int cpu, irq, evtchn; + + init_evtchn_cpu_bindings(); + + /* New event-channel space is not 'live' yet. */ + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + mask_evtchn(evtchn); + + /* No IRQ <-> event-channel mappings. 
*/ + for (irq = 0; irq < NR_IRQS; irq++) + irq_info[irq].evtchn = 0; /* zap event-channel binding */ + + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + evtchn_to_irq[evtchn] = -1; + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", .mask = disable_dynirq, diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 52b6b41b909..e9e11168616 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -471,14 +471,14 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return 0; } -static int gnttab_resume(void) +int gnttab_resume(void) { if (max_nr_grant_frames() < nr_grant_frames) return -ENOSYS; return gnttab_map(0, nr_grant_frames - 1); } -static int gnttab_suspend(void) +int gnttab_suspend(void) { arch_gnttab_unmap_shared(shared, nr_grant_frames); return 0; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index aa7af9e6abc..ba85fa2cff3 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -5,21 +5,113 @@ #include #include #include +#include +#include #include - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 +#include +#include +#include +#include + +#include +#include + +enum shutdown_state { + SHUTDOWN_INVALID = -1, + SHUTDOWN_POWEROFF = 0, + SHUTDOWN_SUSPEND = 2, + /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + report a crash, not be instructed to crash! + HALT is the same as POWEROFF, as far as we're concerned. The tools use + the distinction when we return the reason code to them. 
*/ + SHUTDOWN_HALT = 4, +}; /* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; +static enum shutdown_state shutting_down = SHUTDOWN_INVALID; + +static int xen_suspend(void *data) +{ + int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + load_cr3(swapper_pg_dir); + + xen_mm_pin_all(); + gnttab_suspend(); + xen_time_suspend(); + xen_pre_suspend(); + + /* + * This hypercall returns 1 if suspend was cancelled + * or the domain was merely checkpointed, and 0 if it + * is resuming in a new domain. + */ + *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + xen_post_suspend(*cancelled); + xen_time_resume(); + gnttab_resume(); + xen_mm_unpin_all(); + + if (!*cancelled) { + xen_irq_resume(); + xen_console_resume(); + } + + return 0; +} + +static void do_suspend(void) +{ + int err; + int cancelled = 1; + + shutting_down = SHUTDOWN_SUSPEND; + +#ifdef CONFIG_PREEMPT + /* If the kernel is preemptible, we need to freeze all the processes + to prevent them from being in the middle of a pagetable update + during suspend. */ + err = freeze_processes(); + if (err) { + printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + return; + } +#endif + + err = device_suspend(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen suspend: device_suspend %d\n", err); + goto out; + } + + printk("suspending xenbus...\n"); + /* XXX use normal device tree? 
*/ + xenbus_suspend(); + + err = stop_machine_run(xen_suspend, &cancelled, 0); + if (err) { + printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + goto out; + } + + if (!cancelled) + xenbus_resume(); + else + xenbus_suspend_cancel(); + + device_resume(); + + +out: +#ifdef CONFIG_PREEMPT + thaw_processes(); +#endif + shutting_down = SHUTDOWN_INVALID; +} static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -52,11 +144,17 @@ static void shutdown_handler(struct xenbus_watch *watch, } if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) + strcmp(str, "halt") == 0) { + shutting_down = SHUTDOWN_POWEROFF; orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) + } else if (strcmp(str, "reboot") == 0) { + shutting_down = SHUTDOWN_POWEROFF; /* ? */ ctrl_alt_del(); - else { +#ifdef CONFIG_PM_SLEEP + } else if (strcmp(str, "suspend") == 0) { + do_suspend(); +#endif + } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 590cff32415..02955a1c3d7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */ PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */ +PAGEFLAG(SavePinned, dirty); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) diff --git a/include/xen/events.h b/include/xen/events.h index a82ec0c45c3..67c4436554a 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -41,4 +41,7 @@ static inline void notify_remote_via_evtchn(int port) } extern void notify_remote_via_irq(int irq); + +extern void xen_irq_resume(void); + #endif /* _XEN_EVENTS_H */ diff --git 
a/include/xen/grant_table.h b/include/xen/grant_table.h index 46620484612..a40f1cd91be 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -51,6 +51,9 @@ struct gnttab_free_callback { u16 count; }; +int gnttab_suspend(void); +int gnttab_resume(void); + int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 10ddfe0142d..5d7a6db54a8 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,4 +5,13 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +void xen_pre_suspend(void); +void xen_post_suspend(int suspend_cancelled); + +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); + +void xen_time_suspend(void); +void xen_time_resume(void); + #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.2.3-70-g09d2 From b1829d2705daa7cb72eb1e08bdc8b7e9fad34266 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2008 01:22:08 +0200 Subject: ftrace: fix merge Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ffbbd54a720..b482fe88bc0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -122,7 +122,7 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +#ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else -- cgit v1.2.3-70-g09d2 From ad90c0e3ce8d20d6873b57e36181ef6d7a0097fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2008 20:48:37 -0400 Subject: ftrace: user update and disable dynamic ftrace daemon In dynamic ftrace, the mcount function starts off pointing to a stub function that just returns. On start up, the call to the stub is modified to point to a "record_ip" function. 
The job of the record_ip function is to add the function to a pre-allocated hash list. If the function is already there, it simply is ignored, otherwise it is added to the list. Later, a ftraced daemon wakes up and calls kstop_machine if any functions have been recorded, and changes the calls to the recorded functions to a simple nop. If no functions were recorded, the daemon goes back to sleep. The daemon wakes up once a second to see if it needs to update any newly recorded functions into nops. Usually it does not, but if a lot of code has been executed for the first time in the kernel, the ftraced daemon will call kstop_machine to update those into nops. The problem currently is that there's no way to stop the daemon from doing this, and it can cause unneeded latencies (800us which for some is bothersome). This patch adds a new file /debugfs/tracing/ftraced_enabled. If the daemon is active, reading this will return "enabled\n" and "disabled\n" when the daemon is not running. To disable the daemon, the user can echo "0" or "disable" into this file, and "1" or "enable" to re-enable the daemon. Since the daemon is used to convert the functions into nops to increase the performance of the system, I also added that anytime something is written into the ftraced_enabled file, kstop_machine will run if there are new functions that have been detected that need to be converted. This way the user can disable the daemon but still be able to control the conversion of the mcount calls to nops by simply, "echo 0 > /debugfs/tracing/ftraced_enabled" when they need to do more conversions. 
To see the number of converted functions: "cat /debugfs/tracing/dyn_ftrace_total_info" Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++ kernel/trace/ftrace.c | 157 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 116 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b482fe88bc0..623819433ed 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -72,9 +72,15 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +void ftrace_disable_daemon(void); +void ftrace_enable_daemon(void); + #else # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) +# define ftrace_disable_daemon() do { } while (0) +# define ftrace_enable_daemon() do { } while (0) #endif /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1843edc098a..f762f5a2d33 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE static struct task_struct *ftraced_task; -static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); -static unsigned long ftraced_iteration_counter; enum { FTRACE_ENABLE_CALLS = (1 << 0), @@ -189,6 +187,7 @@ static struct ftrace_page *ftrace_pages; static int ftraced_trigger; static int ftraced_suspend; +static int ftraced_stop; static int ftrace_record_suspend; @@ -474,14 +473,21 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } +static int __ftrace_update_code(void *ignore); + static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine 
stopped + */ + __ftrace_update_code(NULL); ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + } else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); if (*command & FTRACE_UPDATE_TRACE_FUNC) @@ -503,6 +509,25 @@ static void ftrace_run_update_code(int command) stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + static ftrace_func_t saved_ftrace_func; static void ftrace_startup(void) @@ -603,6 +628,7 @@ static int __ftrace_update_code(void *ignore) int i; /* Don't be recording funcs now */ + ftrace_record_suspend++; save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; @@ -628,18 +654,23 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; return 0; } -static void ftrace_update_code(void) +static int ftrace_update_code(void) { - if (unlikely(ftrace_disabled)) - return; + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; } static int ftraced(void *ignore) @@ -658,14 +689,13 @@ static int ftraced(void *ignore) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { - ftrace_record_suspend++; - ftrace_update_code(); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { usecs = nsecs_to_usecs(ftrace_update_time); if (ftrace_update_tot_cnt > 100000) { ftrace_update_tot_cnt = 0; pr_info("hm, dftrace 
overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", + " (%lu total) in %lu usec%s\n", ftrace_update_cnt, ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, @@ -673,15 +703,10 @@ static int ftraced(void *ignore) ftrace_disabled = 1; WARN_ON_ONCE(1); } - ftraced_trigger = 0; - ftrace_record_suspend--; } - ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); - wake_up_interruptible(&ftraced_waiters); - ftrace_shutdown_replenish(); } __set_current_state(TASK_RUNNING); @@ -1219,6 +1244,55 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? "disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1242,51 +1316,34 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions - * - * The ftrace dynamic update daemon only 
wakes up once a second. - * There may be cases where an update needs to be done immediately - * for tests or internal kernel tracing to begin. This function - * wakes the daemon to do an update and will not return until the - * update is complete. */ int ftrace_force_update(void) { - unsigned long last_counter; - DECLARE_WAITQUEUE(wait, current); int ret = 0; if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - last_counter = ftraced_iteration_counter; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ftraced_waiters, &wait); - if (unlikely(!ftraced_task)) { - ret = -ENODEV; - goto out; - } - - do { - mutex_unlock(&ftraced_lock); - wake_up_process(ftraced_task); - schedule(); - mutex_lock(&ftraced_lock); - if (signal_pending(current)) { - ret = -EINTR; - break; - } - set_current_state(TASK_INTERRUPTIBLE); - } while (last_counter == ftraced_iteration_counter); + /* + * If ftraced_trigger is not set, then there is nothing + * to update. + */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; - out: mutex_unlock(&ftraced_lock); - remove_wait_queue(&ftraced_waiters, &wait); - set_current_state(TASK_RUNNING); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1331,6 +1388,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); return 0; } -- cgit v1.2.3-70-g09d2 From 18404756765c713a0be4eb1082920c04822ce588 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:02:52 -0700 Subject: genirq: Expose default irq affinity mask (take 3) Current IRQ affinity interface does not provide a way to set affinity for the IRQs that will be allocated/activated in the future. 
This patch creates /proc/irq/default_smp_affinity that lets users set default affinity mask for the newly allocated IRQs. Changing the default does not affect affinity masks for the currently active IRQs, they have to be changed explicitly. Updated based on Paul J's comments and added some more documentation. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: a.p.zijlstra@chello.nl Cc: tglx@linutronix.de Cc: rdunlap@xenotime.net Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- Documentation/IRQ-affinity.txt | 37 ++++++++++++++++++------ Documentation/filesystems/proc.txt | 29 ++++++++++++------- arch/alpha/kernel/irq.c | 5 ++-- include/linux/interrupt.h | 5 ++++ include/linux/irq.h | 9 ------ kernel/irq/manage.c | 28 ++++++++++++++++-- kernel/irq/proc.c | 59 +++++++++++++++++++++++++++++++++++--- 7 files changed, 134 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt index 938d7dd0549..b4a615b7840 100644 --- a/Documentation/IRQ-affinity.txt +++ b/Documentation/IRQ-affinity.txt @@ -1,17 +1,26 @@ +ChangeLog: + Started by Ingo Molnar + Update by Max Krasnyansky -SMP IRQ affinity, started by Ingo Molnar - +SMP IRQ affinity /proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed to turn off all CPUs, and if an IRQ controller does not support IRQ affinity then the value will not change from the default 0xffffffff. +/proc/irq/default_smp_affinity specifies default affinity mask that applies +to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask +will be set to the default mask. It can then be changed as described above. +Default mask is 0xffffffff. 
+ Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting -the IRQ to CPU4-7 (this is an 8-CPU SMP box): +it to CPU4-7 (this is an 8-CPU SMP box): +[root@moon 44]# cd /proc/irq/44 [root@moon 44]# cat smp_affinity ffffffff + [root@moon 44]# echo 0f > smp_affinity [root@moon 44]# cat smp_affinity 0000000f @@ -21,17 +30,27 @@ PING hell (195.4.7.3): 56 data bytes --- hell ping statistics --- 6029 packets transmitted, 6027 packets received, 0% packet loss round-trip min/avg/max = 0.1/0.1/0.4 ms -[root@moon 44]# cat /proc/interrupts | grep 44: - 44: 0 1785 1785 1783 1783 1 -1 0 IO-APIC-level eth1 +[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:' + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 + 44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1 + +As can be seen from the line above IRQ44 was delivered only to the first four +processors (0-3). +Now let's restrict that IRQ to CPU(4-7). + [root@moon 44]# echo f0 > smp_affinity +[root@moon 44]# cat smp_affinity +000000f0 [root@moon 44]# ping -f h PING hell (195.4.7.3): 56 data bytes .. --- hell ping statistics --- 2779 packets transmitted, 2777 packets received, 0% packet loss round-trip min/avg/max = 0.1/0.5/585.4 ms -[root@moon 44]# cat /proc/interrupts | grep 44: - 44: 1068 1785 1785 1784 1784 1069 1070 1069 IO-APIC-level eth1 -[root@moon 44]# +[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:' + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 + 44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1 + +This time around IRQ44 was delivered only to the last four processors. +i.e. counters for the CPU0-3 did not change. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dbc3c6a3650..7f268f327d7 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -380,28 +380,35 @@ i386 and x86_64 platforms support the new IRQ vector displays. Of some interest is the introduction of the /proc/irq directory to 2.4. 
It could be used to set IRQ to CPU affinity, this means that you can "hook" an IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the -irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask +irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and +prof_cpu_mask. For example > ls /proc/irq/ 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask - 1 11 13 15 17 19 3 5 7 9 + 1 11 13 15 17 19 3 5 7 9 default_smp_affinity > ls /proc/irq/0/ smp_affinity -The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ -is the same by default: +smp_affinity is a bitmask, in which you can specify which CPUs can handle the +IRQ, you can set it by doing: - > cat /proc/irq/0/smp_affinity - ffffffff + > echo 1 > /proc/irq/10/smp_affinity + +This means that only the first CPU will handle the IRQ, but you can also echo +5 which means that only the first and third CPU can handle the IRQ. -It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can -set it by doing: +The contents of each smp_affinity file is the same by default: + + > cat /proc/irq/0/smp_affinity + ffffffff - > echo 1 > /proc/irq/prof_cpu_mask +The default_smp_affinity mask applies to all non-active IRQs, which are the +IRQs which have not yet been allocated/activated, and hence which lack a +/proc/irq/[0-9]* directory. -This means that only the first CPU will handle the IRQ, but you can also echo 5 -which means that only the first and fourth CPU can handle the IRQ. +prof_cpu_mask specifies which CPUs are to be profiled by the system wide +profiler. Default value is ffffffff (all cpus). The way IRQs are routed is handled by the IO-APIC, and it's Round Robin between all the CPUs which are allowed to handle it. 
As usual the kernel has diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index facf82a5499..c626a821cdc 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -42,8 +42,7 @@ void ack_bad_irq(unsigned int irq) #ifdef CONFIG_SMP static char irq_user_affinity[NR_IRQS]; -int -select_smp_affinity(unsigned int irq) +int irq_select_affinity(unsigned int irq) { static int last_cpu; int cpu = last_cpu + 1; @@ -51,7 +50,7 @@ select_smp_affinity(unsigned int irq) if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq]) return 1; - while (!cpu_possible(cpu)) + while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity)) cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); last_cpu = cpu; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f1fc7470d26..043400f3d45 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -104,8 +104,11 @@ extern void enable_irq(unsigned int irq); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +extern cpumask_t irq_default_affinity; + extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); extern int irq_can_set_affinity(unsigned int irq); +extern int irq_select_affinity(unsigned int irq); #else /* CONFIG_SMP */ @@ -119,6 +122,8 @@ static inline int irq_can_set_affinity(unsigned int irq) return 0; } +static inline int irq_select_affinity(unsigned int irq) { return 0; } + #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */ #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/include/linux/irq.h b/include/linux/irq.h index 552e0ec269c..8ccb462ea42 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -244,15 +244,6 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) } #endif -#ifdef CONFIG_AUTO_IRQ_AFFINITY -extern int select_smp_affinity(unsigned int irq); -#else -static inline int select_smp_affinity(unsigned int irq) -{ - return 1; -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int 
irq) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611a33b..469814e9b9e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,8 @@ #ifdef CONFIG_SMP +cpumask_t irq_default_affinity = CPU_MASK_ALL; + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) return 0; } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +/* + * Generic version of the affinity autoselector. + */ +int irq_select_affinity(unsigned int irq) +{ + cpumask_t mask; + + if (!irq_can_set_affinity(irq)) + return 0; + + cpus_and(mask, cpu_online_map, irq_default_affinity); + + irq_desc[irq].affinity = mask; + irq_desc[irq].chip->set_affinity(irq, mask); + + set_balance_irq_affinity(irq, mask); + return 0; +} +#endif + #endif /** @@ -382,6 +405,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) } else /* Undo nested disables: */ desc->depth = 1; + + /* Set default affinity mask once everything is setup */ + irq_select_affinity(irq); } /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; @@ -571,8 +597,6 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; - select_smp_affinity(irq); - #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c2f2ccb0549..6c6d35d68ee 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { unsigned int irq = (int)(long)data, full_count = count, err; - cpumask_t new_value, tmp; + cpumask_t new_value; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, * way to make the system unusable accidentally :-) At least 
* one online CPU still has to be targeted. */ - cpus_and(tmp, new_value, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return select_smp_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? -EINVAL : full_count; irq_set_affinity(irq, new_value); return full_count; } +static int default_affinity_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_default_affinity); + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +static int default_affinity_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int full_count = count, err; + cpumask_t new_value; + + err = cpumask_parse_user(buffer, count, new_value); + if (err) + return err; + + if (!is_affinity_mask_valid(new_value)) + return -EINVAL; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. 
+ */ + if (!cpus_intersects(new_value, cpu_online_map)) + return -EINVAL; + + irq_default_affinity = new_value; + + return full_count; +} #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +void register_default_affinity_proc(void) +{ +#ifdef CONFIG_SMP + struct proc_dir_entry *entry; + + /* create /proc/irq/default_smp_affinity */ + entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); + if (entry) { + entry->data = NULL; + entry->read_proc = default_affinity_read; + entry->write_proc = default_affinity_write; + } +#endif +} + void init_irq_proc(void) { int i; @@ -180,6 +229,8 @@ void init_irq_proc(void) if (!root_irq_dir) return; + register_default_affinity_proc(); + /* * Create entries for all existing IRQs. */ -- cgit v1.2.3-70-g09d2 From 554ec22f075d46e4363520a407d2b7eeb5dfdd43 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:03 +0200 Subject: namespacecheck: more sched.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ae0be3c6237..dc36c3aea01 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -134,7 +134,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; @@ -823,23 +822,6 @@ extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ -/* - * A runqueue laden with a single nice 0 task scores a weighted_cpuload of - * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a - * task of nice 0 or enough lower priority tasks to bring up the - * weighted_cpuload - */ -static inline int above_background_load(void) -{ - unsigned long cpu; - - for_each_online_cpu(cpu) { - if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) - return 1; - } - return 0; -} - struct io_context; /* See blkdev.h */ #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -- cgit v1.2.3-70-g09d2 From c7aceaba042702538b23cf4e0de1b2891ad8e671 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 15 May 2008 12:09:15 +0100 Subject: sched: reorder task_struct to reduce padding on 64bit builds This patch removes 24 bytes of padding and allows 1 extra object per slab on my fedora based config. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc36c3aea01..ea2857b9959 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,7 @@ struct task_struct { #endif int prio, static_prio, normal_prio; + unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1104,7 +1105,6 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - unsigned int rt_priority; cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; cputime_t prev_utime, prev_stime; @@ -1123,12 +1123,12 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned securebits; struct user_struct *user; + unsigned securebits; #ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested keys to */ struct key *request_key_auth; /* assumed 
request_key authority */ struct key *thread_keyring; /* keyring private to this thread */ - unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1215,8 +1215,8 @@ struct task_struct { # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; - struct held_lock held_locks[MAX_LOCK_DEPTH]; unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif /* journalling filesystem info */ @@ -1244,10 +1244,6 @@ struct task_struct { u64 acct_vm_mem1; /* accumulated virtual memory usage */ cputime_t acct_stimexpd;/* stime since last update */ #endif -#ifdef CONFIG_NUMA - struct mempolicy *mempolicy; - short il_next; -#endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; int cpuset_mems_generation; @@ -1266,6 +1262,10 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; -- cgit v1.2.3-70-g09d2 From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:05 -0400 Subject: sched: fix cpupri hotplug support The RT folks over at RedHat found an issue w.r.t. hotplug support which was traced to problems with the cpupri infrastructure in the scheduler: https://bugzilla.redhat.com/show_bug.cgi?id=449676 This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel. This patch applies to 25.4-rt4, though it should trivially apply to most cpupri enabled kernels mentioned above. It turned out that the issue was that offline cpus could get inadvertently registered with cpupri so that they were erroneously selected during migration decisions. The end result would be an OOPS as the offline cpu had tasks routed to it. 
This patch generalizes the old join/leave domain interface into an online/offline interface, and adjusts the root-domain/hotplug code to utilize it. I was able to easily reproduce the issue prior to this patch, and am no longer able to reproduce it after this patch. I can offline cpus indefinitely and everything seems to be in working order. Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point me in the right direction. Also thank you to Peter for reviewing the early iterations of this patch. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++------------- kernel/sched_rt.c | 24 +++++++++++++++++------ 3 files changed, 60 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ea2857b9959..d25acf600a3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -903,8 +903,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, const cpumask_t *newmask); - void (*join_domain)(struct rq *rq); - void (*leave_domain)(struct rq *rq); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); diff --git a/kernel/sched.c b/kernel/sched.c index dc0be113f41..f0ed81b7128 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -529,6 +529,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct 
task_struct *p) { @@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6409,12 +6439,7 @@ static void 
rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -7824,6 +7849,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 44b06d75416..e4821593d4d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); } #endif #ifdef CONFIG_SMP @@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_rq->highest_prio != highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); } update_rt_migration(rq_of_rt_rq(rt_rq)); @@ -1154,7 +1166,7 @@ static void 
set_cpus_allowed_rt(struct task_struct *p, } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); @@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, -- cgit v1.2.3-70-g09d2 From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So end_pfn could be different between the value in early_node_map to node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? 
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- include/linux/mm.h | 3 +-- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3d4c1..489605bab85 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30..7cbd949f251 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c3c2c..21540868407 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local 
node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn) +void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) { - int i; + int i, j; + int removed = 0; /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) - if (early_node_map[i].end_pfn == old_end_pfn) { + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= new_end_pfn) { + /* clear it */ + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].end_pfn > new_end_pfn) { early_node_map[i].end_pfn = new_end_pfn; - break; + continue; } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } } /** -- cgit v1.2.3-70-g09d2 From 0eb967012ea15e6e8cfab483d9fa37bc602d400c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:30 +0530 Subject: ftrace: prevent freeing of all failed updates Prevent freeing of records which cause problems and correspond to function from core kernel text. A new flag, FTRACE_FL_CONVERTED is used to mark a record as "converted". All other records are patched lazily to NOPs. Failed records now also remain on ftrace_hash table. 
Each invocation of ftrace_record_ip now checks whether the traced function has ever been recorded (including past failures) and doesn't re-record it again. Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 76 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 623819433ed..20e14d0093c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), + FTRACE_FL_CONVERTED = (1 << 5), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f762f5a2d33..ec54cb7d69d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -216,6 +216,12 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } +/* called from kstop_machine */ +static inline void ftrace_del_hash(struct dyn_ftrace *node) +{ + hlist_del(&node->node); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -332,12 +338,11 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void +static int __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { unsigned long ip, fl; - int failed; ip = rec->ip; @@ -364,7 +369,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) - return; + return 0; /* * If it is enabled disable it, @@ -388,7 +393,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, */ fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) - return; + return 0; new = ftrace_call_replace(ip, 
FTRACE_ADDR); } else @@ -396,34 +401,24 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if (enable) { if (rec->flags & FTRACE_FL_ENABLED) - return; + return 0; rec->flags |= FTRACE_FL_ENABLED; } else { if (!(rec->flags & FTRACE_FL_ENABLED)) - return; + return 0; rec->flags &= ~FTRACE_FL_ENABLED; } } - failed = ftrace_modify_code(ip, old, new); - if (failed) { - unsigned long key; - /* It is possible that the function hasn't been converted yet */ - key = hash_long(ip, FTRACE_HASHBITS); - if (!ftrace_ip_in_hash(ip, key)) { - rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); - } - - } + return ftrace_modify_code(ip, old, new); } static void ftrace_replace_code(int enable) { + int i, failed; unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - int i; if (enable) old = ftrace_nop_replace(); @@ -438,7 +433,15 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; - __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, old, new, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(rec->ip)) { + ftrace_del_hash(rec); + ftrace_free_rec(rec); + } + } } } } @@ -467,7 +470,6 @@ ftrace_code_disable(struct dyn_ftrace *rec) failed = ftrace_modify_code(ip, call, nop); if (failed) { rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); return 0; } return 1; @@ -621,8 +623,7 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; - struct hlist_head head; - struct hlist_node *t; + struct hlist_node *t, *n; int save_ftrace_enabled; cycle_t start, stop; int i; @@ -637,18 +638,33 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! 
*/ for (i = 0; i < FTRACE_HASHSIZE; i++) { - if (hlist_empty(&ftrace_hash[i])) - continue; + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + /* Skip over failed records which have not been + * freed. */ + if (p->flags & FTRACE_FL_FAILED) + continue; - head = ftrace_hash[i]; - INIT_HLIST_HEAD(&ftrace_hash[i]); + /* Unconverted records are always at the head of the + * hash bucket. Once we encounter a converted record, + * simply skip over to the next bucket. Saves ftraced + * some processor cycles (ftrace does its bid for + * global warming :-p ). */ + if (p->flags & (FTRACE_FL_CONVERTED)) + break; - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry(p, t, &head, node) { - if (ftrace_code_disable(p)) + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; - } + } else { + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(p->ip)) { + ftrace_del_hash(p); + ftrace_free_rec(p); + } + } + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v1.2.3-70-g09d2 From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. 
Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/cpuset.c | 14 +++++++++++++- kernel/kthread.c | 1 + kernel/sched.c | 6 ++++++ 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d25acf600a3..2db1485f865 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1486,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a..b84354f4de3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = 
oldcs->mems_allowed; to = cs->mems_allowed; diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e..97747cdd37c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; + k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index e9c24a12865..164fe7fe0d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) goto out; } + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { + ret = -EINVAL; + goto out; + } + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { -- cgit v1.2.3-70-g09d2 From c50cbb05a05cf1f9ca3592272eff053c847727d8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 4 Jun 2008 21:47:29 -0700 Subject: cpu topology: always define CPU topology information This can result in an empty topology directory in sysfs, and requires in-kernel users to protect all uses with #ifdef - see . The documentation of CPU topology specifies what the defaults should be if only partial information is available from the hardware. So we can provide these defaults as a fallback. 
This patch: - Adds default definitions of the 4 topology macros to <linux/topology.h> - Changes drivers/base/topology.c to use the topology macros unconditionally and to cope with definitions that aren't lvalues - Updates documentation accordingly [ From: Andrew Morton - fold now-duplicated code - fix layout ] Signed-off-by: Ben Hutchings Cc: Vegard Nossum Cc: Nick Piggin Cc: Chandra Seetharaman Cc: Suresh Siddha Cc: Mike Travis Cc: Christoph Lameter Cc: John Hawkes Cc: Zhang, Yanmin Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- Documentation/cputopology.txt | 26 +++++++++----------------- drivers/base/topology.c | 38 ++++++++++---------------------------- include/linux/topology.h | 13 +++++++++++++ 3 files changed, 32 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index b61cb956402..bd699da2466 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -14,9 +14,8 @@ represent the thread siblings to cpu X in the same physical package; To implement it in an architecture-neutral way, a new source file, drivers/base/topology.c, is to export the 4 attributes. -If one architecture wants to support this feature, it just needs to -implement 4 defines, typically in file include/asm-XXX/topology.h. -The 4 defines are: +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h: #define topology_physical_package_id(cpu) #define topology_core_id(cpu) #define topology_thread_siblings(cpu) @@ -25,17 +24,10 @@ The 4 defines are: The type of **_id is int. The type of siblings is cpumask_t. -To be consistent on all architectures, the 4 attributes should have -default values if their values are unavailable. Below is the rule. -1) physical_package_id: If cpu has no physical package id, -1 is the -default value. -2) core_id: If cpu doesn't support multi-core, its core id is 0. 
-3) thread_siblings: Just include itself, if the cpu doesn't support -HT/multi-thread. -4) core_siblings: Just include itself, if the cpu doesn't support -multi-core and HT/Multi-thread. - -So be careful when declaring the 4 defines in include/asm-XXX/topology.h. - -If an attribute isn't defined on an architecture, it won't be exported. - +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: +1) physical_package_id: -1 +2) core_id: 0 +3) thread_siblings: just the given CPU +4) core_siblings: just the given CPU diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044d2e7..24d29a9fc25 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -59,60 +59,42 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(0, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(0, &siblings, buf); \ } #define define_siblings_show_list(name) \ static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(1, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(1, &siblings, buf); \ } #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) -#ifdef topology_physical_package_id define_id_show_func(physical_package_id); define_one_ro(physical_package_id); -#define ref_physical_package_id_attr &attr_physical_package_id.attr, -#else -#define ref_physical_package_id_attr -#endif -#ifdef topology_core_id define_id_show_func(core_id); define_one_ro(core_id); -#define ref_core_id_attr &attr_core_id.attr, -#else -#define ref_core_id_attr -#endif -#ifdef topology_thread_siblings 
define_siblings_show_func(thread_siblings); define_one_ro(thread_siblings); define_one_ro(thread_siblings_list); -#define ref_thread_siblings_attr \ - &attr_thread_siblings.attr, &attr_thread_siblings_list.attr, -#else -#define ref_thread_siblings_attr -#endif -#ifdef topology_core_siblings define_siblings_show_func(core_siblings); define_one_ro(core_siblings); define_one_ro(core_siblings_list); -#define ref_core_siblings_attr \ - &attr_core_siblings.attr, &attr_core_siblings_list.attr, -#else -#define ref_core_siblings_attr -#endif static struct attribute *default_attrs[] = { - ref_physical_package_id_attr - ref_core_id_attr - ref_thread_siblings_attr - ref_core_siblings_attr + &attr_physical_package_id.attr, + &attr_core_id.attr, + &attr_thread_siblings.attr, + &attr_thread_siblings_list.attr, + &attr_core_siblings.attr, + &attr_core_siblings_list.attr, NULL }; diff --git a/include/linux/topology.h b/include/linux/topology.h index 24f3d2282e1..2158fc0d5a5 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -179,4 +179,17 @@ void arch_update_cpu_topology(void); #endif #endif /* CONFIG_NUMA */ +#ifndef topology_physical_package_id +#define topology_physical_package_id(cpu) ((void)(cpu), -1) +#endif +#ifndef topology_core_id +#define topology_core_id(cpu) ((void)(cpu), 0) +#endif +#ifndef topology_thread_siblings +#define topology_thread_siblings(cpu) cpumask_of_cpu(cpu) +#endif +#ifndef topology_core_siblings +#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3-70-g09d2 From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 12 May 2008 15:44:40 +0200 Subject: x86, generic: mark early_printk as asmlinkage It's not explicitly marked as asmlinkage, but invoked from x86_32 startup code with parameters on stack. No other architectures define early_printk and none of them are affected by this change, since <linux/linkage.h> defines asmlinkage as empty token. 
Signed-off-by: Jiri Slaby Cc: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 2 +- include/linux/kernel.h | 2 +- kernel/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b72..ff9e7350da5 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -196,7 +196,7 @@ static struct console simnow_console = { static struct console *early_console = &early_vga_console; static int early_console_initialized; -void early_printk(const char *fmt, ...) +asmlinkage void early_printk(const char *fmt, ...) { char buf[512]; int n; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f2a668c195b..4cb8d3df414 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -207,7 +207,7 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ { return false; } #endif -extern void __attribute__((format(printf, 1, 2))) +extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); unsigned long int_sqrt(unsigned long); diff --git a/kernel/printk.c b/kernel/printk.c index 70cfa5ac75c..de1a4f4470c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -38,7 +38,7 @@ /* * Architectures can override it: */ -void __attribute__((weak)) early_printk(const char *fmt, ...) +void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) { } -- cgit v1.2.3-70-g09d2 From 443cd507ce7f78c6f8742b72736585c031d5a921 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Fri, 20 Jun 2008 16:39:21 +0800 Subject: lockdep: add lock_class information to lock_chain and output it This patch records array of lock_class into lock_chain, and export lock_chain information via /proc/lockdep_chains. It is based on x86/master branch of git-x86 tree, and has been tested on x86_64 platform. 
Signed-off-by: Huang Ying Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 3 ++ kernel/lockdep.c | 38 +++++++++++++++++-- kernel/lockdep_internals.h | 6 +++ kernel/lockdep_proc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4c4d236ded1..b26fbc715a5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -182,6 +182,9 @@ struct lock_list { * We record lock dependency chains, so that we can cache them: */ struct lock_chain { + u8 irq_context; + u8 depth; + u16 base; struct list_head entry; u64 chain_key; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f08..a796f1f38ac 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1458,7 +1458,14 @@ out_bug: } unsigned long nr_lock_chains; -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +atomic_t nr_chain_hlocks; +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) +{ + return lock_classes + chain_hlocks[chain->base + i]; +} /* * Look up a dependency chain. If the key is not present yet then @@ -1466,10 +1473,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; * validated. If the key is already hashed, return 0. * (On return with 1 graph_lock is held.) 
*/ -static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) +static inline int lookup_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) { + struct lock_class *class = hlock->class; struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; + struct held_lock *hlock_curr, *hlock_next; + int i, j, n; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -1517,6 +1529,26 @@ cache_hit: } chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; + chain->irq_context = hlock->irq_context; + /* Find the first held_lock of current chain */ + hlock_next = hlock; + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock_next->irq_context) + break; + hlock_next = hlock; + } + i++; + chain->depth = curr->lockdep_depth + 1 - i; + n = atomic_add_return(chain->depth, &nr_chain_hlocks); + if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = n - chain->depth; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class - lock_classes; + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; + } list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); inc_chains(); @@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * graph_lock for us) */ if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(chain_key, hlock->class)) { + lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: * diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 8ce09bc4613..db09b176dd3 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -23,6 +23,8 @@ #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + /* * 
Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. @@ -30,15 +32,19 @@ #define MAX_STACK_TRACE_ENTRIES 262144UL extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; extern unsigned long nr_lock_chains; +extern atomic_t nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 688c5f1940b..14d052c8a83 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -178,6 +178,93 @@ static const struct file_operations proc_lockdep_operations = { .release = seq_release, }; +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_chain *chain; + + (*pos)++; + + if (v == SEQ_START_TOKEN) + chain = m->private; + else { + chain = v; + + if (*pos < nr_lock_chains) + chain = lock_chains + *pos; + else + chain = NULL; + } + + return chain; +} + +static void *lc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return SEQ_START_TOKEN; + + if (*pos < nr_lock_chains) + return lock_chains + *pos; + + return NULL; +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ + struct lock_chain *chain = v; + struct lock_class *class; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock chains:\n"); + return 0; + } + + seq_printf(m, "irq_context: %d\n", chain->irq_context); + + for (i = 0; i < chain->depth; i++) { + class = lock_chain_get_class(chain, i); + seq_printf(m, "[%p] ", class->key); + print_name(m, class); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + return 0; +} 
+ +static const struct seq_operations lockdep_chains_ops = { + .start = lc_start, + .next = lc_next, + .stop = lc_stop, + .show = lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &lockdep_chains_ops); + if (!res) { + struct seq_file *m = file->private_data; + + if (nr_lock_chains) + m->private = lock_chains; + else + m->private = NULL; + } + return res; +} + +static const struct file_operations proc_lockdep_chains_operations = { + .open = lockdep_chains_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP @@ -294,6 +381,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", + atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -661,6 +750,8 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create("lockdep_chains", S_IRUSR, NULL, + &proc_lockdep_chains_operations); proc_create("lockdep_stats", S_IRUSR, NULL, &proc_lockdep_stats_operations); -- cgit v1.2.3-70-g09d2 From 0b2806768899dba5967bcd4a3b93eaed9a1dc4f3 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 18 May 2008 14:27:41 -0600 Subject: Add cycle_kernel_lock() A number of driver functions are so obviously trivial that they do not need the big kernel lock - at least not overtly. It turns out that the acquisition of the BKL in driver open() functions can perform a sort of poor-hacker's serialization function, delaying the open operation until the driver is certain to have completed its initialization. 
Add a simple cycle_kernel_lock() function for these cases to make it clear that there is no need to *hold* the BKL, just to be sure that we can acquire it. Signed-off-by: Jonathan Corbet --- include/linux/smp_lock.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index aab3a4cff4e..813be59bf34 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -27,11 +27,24 @@ static inline int reacquire_kernel_lock(struct task_struct *task) extern void __lockfunc lock_kernel(void) __acquires(kernel_lock); extern void __lockfunc unlock_kernel(void) __releases(kernel_lock); +/* + * Various legacy drivers don't really need the BKL in a specific + * function, but they *do* need to know that the BKL became available. + * This function just avoids wrapping a bunch of lock/unlock pairs + * around code which doesn't really need it. + */ +static inline void cycle_kernel_lock(void) +{ + lock_kernel(); + unlock_kernel(); +} + #else #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) #define release_kernel_lock(task) do { } while(0) +#define cycle_kernel_lock() do { } while(0) #define reacquire_kernel_lock(task) 0 #define kernel_locked() 1 -- cgit v1.2.3-70-g09d2 From 20d4fdc1a788e4ca0aaf2422772ba668e7e10839 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 9 Jun 2008 16:40:36 -0700 Subject: [patch 2/4] fs: make struct file arg to d_path const Signed-off-by: Jan Engelhardt Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 2 +- include/linux/dcache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 3ee588d5f58..c4c9072d810 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1847,7 +1847,7 @@ Elong: * * "buflen" should be positive. Caller holds the dcache_lock. 
*/ -char *d_path(struct path *path, char *buf, int buflen) +char *d_path(const struct path *path, char *buf, int buflen) { char *res; struct path root; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2a6639407c8..d982eb89c77 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -300,7 +300,7 @@ extern int d_validate(struct dentry *, struct dentry *); extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *path, struct path *root, char *, int); -extern char *d_path(struct path *, char *, int); +extern char *d_path(const struct path *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ -- cgit v1.2.3-70-g09d2 From f9f48ec72bfc9489a30bc6ddbfcf27d86a8bc651 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 9 Jun 2008 16:40:38 -0700 Subject: [patch 4/4] flock: remove unused fields from file_lock_operations fl_insert and fl_remove are not used right now in the kernel. Remove them. Signed-off-by: Denis V. Lunev Cc: Matthew Wilcox Cc: Alexander Viro Cc: "J. 
Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/locks.c | 6 ------ include/linux/fs.h | 2 -- 2 files changed, 8 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 11dbf08651b..dce8c747371 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) /* insert into file's list */ fl->fl_next = *pos; *pos = fl; - - if (fl->fl_ops && fl->fl_ops->fl_insert) - fl->fl_ops->fl_insert(fl); } /* @@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p) fl->fl_fasync = NULL; } - if (fl->fl_ops && fl->fl_ops->fl_remove) - fl->fl_ops->fl_remove(fl); - if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index d490779f18d..7c108082683 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -894,8 +894,6 @@ static inline int file_check_writeable(struct file *filp) typedef struct files_struct *fl_owner_t; struct file_lock_operations { - void (*fl_insert)(struct file_lock *); /* lock insertion callback */ - void (*fl_remove)(struct file_lock *); /* lock removal callback */ void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; -- cgit v1.2.3-70-g09d2 From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:27 +0530 Subject: ftrace: store mcount address in rec->ip Record the address of the mcount call-site. Currently all archs except sparc64 record the address of the instruction following the mcount call-site. Some general cleanups are entailed. Storing mcount addresses in rec->ip enables looking them up in the kprobe hash table later on to check if they're kprobe'd. 
Signed-off-by: Abhishek Sagar Cc: davem@davemloft.net Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/kernel/armksyms.c | 10 +++++----- arch/arm/kernel/entry-common.S | 4 ++++ arch/arm/kernel/ftrace.c | 16 +++++++--------- arch/powerpc/kernel/entry_32.S | 4 ++++ arch/powerpc/kernel/entry_64.S | 5 ++++- arch/powerpc/kernel/ftrace.c | 21 +++++++-------------- arch/sparc64/kernel/ftrace.c | 10 ++++++---- arch/sparc64/kernel/sparc64_ksyms.c | 2 +- arch/x86/kernel/entry_32.S | 4 ++++ arch/x86/kernel/entry_64.S | 4 ++++ arch/x86/kernel/ftrace.c | 26 +++++++++----------------- arch/x86/kernel/i386_ksyms_32.c | 2 +- arch/x86/kernel/x8664_ksyms_64.c | 2 +- include/asm-arm/ftrace.h | 14 ++++++++++++++ include/asm-powerpc/ftrace.h | 8 ++++++++ include/asm-sparc64/ftrace.h | 14 ++++++++++++++ include/asm-x86/ftrace.h | 14 ++++++++++++++ include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 ++- 19 files changed, 110 insertions(+), 56 deletions(-) create mode 100644 include/asm-arm/ftrace.h create mode 100644 include/asm-sparc64/ftrace.h create mode 100644 include/asm-x86/ftrace.h (limited to 'include/linux') diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 3b132215cbf..cc7b246e965 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * libgcc functions - functions that are used internally by the @@ -48,11 +49,6 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); extern void fp_enter(void); -#ifdef CONFIG_FTRACE -extern void mcount(void); -EXPORT_SYMBOL(mcount); -#endif - /* * This has a special calling convention; it doesn't * modify any of the usual registers, except for LR. 
@@ -186,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be); #endif EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(mcount); +#endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8f79a4789ed..84694e88b42 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "entry-header.S" @@ -104,6 +105,7 @@ ENTRY(ret_from_fork) ENTRY(mcount) stmdb sp!, {r0-r3, lr} mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl mcount_call mcount_call: @@ -114,6 +116,7 @@ ENTRY(ftrace_caller) stmdb sp!, {r0-r3, lr} ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: @@ -134,6 +137,7 @@ ENTRY(mcount) trace: ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE mov lr, pc mov pc, r2 ldmia sp!, {r0-r3, pc} diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 22f3d6e309f..76d50e6091b 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -12,9 +12,10 @@ */ #include + #include +#include -#define INSN_SIZE 4 #define PC_OFFSET 8 #define BL_OPCODE 0xeb000000 #define BL_OFFSET_MASK 0x00ffffff @@ -32,10 +33,10 @@ unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) { long offset; - offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET); + offset = (long)addr - (long)(pc + PC_OFFSET); if (unlikely(offset < -33554432 || offset > 33554428)) { /* Can't generate branches that far (from ARM ARM). Ftrace - * doesn't generate branches outside of core kernel text. + * doesn't generate branches outside of kernel text. 
*/ WARN_ON_ONCE(1); return NULL; @@ -52,7 +53,6 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, old = *(unsigned long *)old_code; new = *(unsigned long *)new_code; - pc -= INSN_SIZE; __asm__ __volatile__ ( "1: ldr %1, [%2] \n" @@ -77,7 +77,7 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, : "memory"); if (!err && (replaced == old)) - flush_icache_range(pc, pc + INSN_SIZE); + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); return err; } @@ -89,8 +89,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; pc = (unsigned long)&ftrace_call; - pc += INSN_SIZE; - memcpy(&old, &ftrace_call, INSN_SIZE); + memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, (unsigned long)func); ret = ftrace_modify_code(pc, (unsigned char *)&old, new); return ret; @@ -103,8 +102,7 @@ int ftrace_mcount_set(unsigned long *data) unsigned char *new; pc = (unsigned long)&mcount_call; - pc += INSN_SIZE; - memcpy(&old, &mcount_call, INSN_SIZE); + memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, *addr); *addr = ftrace_modify_code(pc, (unsigned char *)&old, new); return 0; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3b1dd29d9f9..7231a708af0 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -30,6 +30,7 @@ #include #include #include +#include #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -1053,6 +1054,7 @@ _GLOBAL(_mcount) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -1090,6 +1092,7 @@ _GLOBAL(ftrace_caller) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -1128,6 +1131,7 @@ _GLOBAL(_mcount) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5, ftrace_trace_function) lwz r5,0(r5) diff --git a/arch/powerpc/kernel/entry_64.S 
b/arch/powerpc/kernel/entry_64.S index 2c4d9e056ea..2f511a969d2 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -31,6 +31,7 @@ #include #include #include +#include /* * System calls. @@ -879,6 +880,7 @@ _GLOBAL(_mcount) mflr r3 stdu r1, -112(r1) std r3, 128(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -895,6 +897,7 @@ _GLOBAL(ftrace_caller) stdu r1, -112(r1) std r3, 128(r1) ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -916,7 +919,7 @@ _GLOBAL(_mcount) std r3, 128(r1) ld r4, 16(r11) - + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5,ftrace_trace_function) ld r5,0(r5) ld r5,0(r5) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index e12c593ab9c..3855ceb937b 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -15,8 +15,8 @@ #include #include +#include -#define CALL_BACK 4 static unsigned int ftrace_nop = 0x60000000; @@ -27,9 +27,10 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif + static unsigned int notrace ftrace_calc_offset(long ip, long addr) { - return (int)((addr + CALL_BACK) - ip); + return (int)(addr - ip); } notrace unsigned char *ftrace_nop_replace(void) @@ -76,9 +77,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned new = *(unsigned *)new_code; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -118,12 +116,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, 
MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -134,16 +130,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index c17373195b1..4298d0aee71 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -5,6 +5,8 @@ #include #include +#include + static const u32 ftrace_nop = 0x01000000; notrace unsigned char *ftrace_nop_replace(void) @@ -60,9 +62,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } @@ -71,13 +73,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. 
*/ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 8ac0b99f2c5..b80d982a29c 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -53,6 +53,7 @@ #include #include #include +#include struct poll { int fd; @@ -112,7 +113,6 @@ EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ #if defined(CONFIG_MCOUNT) -extern void _mcount(void); EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 04ea83ccb97..95e6bbe3665 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include #include #include +#include #include "irq_vectors.h" /* @@ -1118,6 +1119,7 @@ ENTRY(mcount) pushl %ecx pushl %edx movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax .globl mcount_call mcount_call: @@ -1136,6 +1138,7 @@ ENTRY(ftrace_caller) pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: @@ -1166,6 +1169,7 @@ trace: pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax call *ftrace_trace_function diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe25e5febca..b0f7308f78a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,6 +51,7 @@ #include #include #include +#include .code64 @@ -68,6 +69,7 @@ ENTRY(mcount) movq %r9, 48(%rsp) movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi .globl mcount_call mcount_call: @@ -99,6 +101,7 @@ ENTRY(ftrace_caller) movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi .globl ftrace_call ftrace_call: @@ -139,6 +142,7 @@ trace: movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function diff --git a/arch/x86/kernel/ftrace.c 
b/arch/x86/kernel/ftrace.c index 55828149e01..ab115cd15fd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,20 +17,21 @@ #include #include +#include -#define CALL_BACK 5 /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; union ftrace_code_union { - char code[5]; + char code[MCOUNT_INSN_SIZE]; struct { char e8; int offset; } __attribute__((packed)); }; + static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); @@ -46,7 +47,7 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) static union ftrace_code_union calc; calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip, addr); + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* * No locking needed, this must be called via kstop_machine @@ -65,9 +66,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -102,12 +100,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[5], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 5); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -118,16 +114,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[5], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. 
*/ - memcpy(old, &mcount_call, 5); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); @@ -142,8 +135,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_mcount_set(data); - ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; return 0; } - diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 29999dbb754..dd7ebee446a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,9 +1,9 @@ -#include #include #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 122885bc5f3..16ff4bf418d 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,6 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ -#include #include #include @@ -11,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h new file mode 100644 index 00000000000..584ef9a8e5a --- /dev/null +++ b/include/asm-arm/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_ARM_FTRACE +#define _ASM_ARM_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif + +#endif /* _ASM_ARM_FTRACE */ diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h index b1bfa704b6e..de921326cca 100644 --- a/include/asm-powerpc/ftrace.h +++ b/include/asm-powerpc/ftrace.h @@ -1,6 +1,14 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ extern void 
_mcount(void); +#endif #endif + +#endif /* _ASM_POWERPC_FTRACE */ diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h new file mode 100644 index 00000000000..f76a40a338b --- /dev/null +++ b/include/asm-sparc64/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_SPARC64_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(void); +#endif + +#endif + +#endif /* _ASM_SPARC64_FTRACE */ diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h new file mode 100644 index 00000000000..c184441133f --- /dev/null +++ b/include/asm-x86/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif /* CONFIG_FTRACE */ + +#endif /* _ASM_X86_FTRACE */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20e14d0093c..366098d591d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,7 +31,6 @@ int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -extern void mcount(void); #else /* !CONFIG_FTRACE */ # define register_ftrace_function(ops) do { } while (0) @@ -54,7 +53,7 @@ enum { struct dyn_ftrace { struct hlist_node node; - unsigned long ip; + unsigned long ip; /* address of mcount call-site */ unsigned long flags; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d5bcf69952..f1e9e5c74e6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,8 @@ #include #include +#include + #include "trace.h" /* ftrace_enabled is a method to turn ftrace on or off */ @@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip) } #define FTRACE_ADDR ((long)(ftrace_caller)) -#define 
MCOUNT_ADDR ((long)(mcount)) static int __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v1.2.3-70-g09d2 From 785656a41f9a9c0e843a23d1ae05d900b5158f8f Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:39 +0530 Subject: kprobes: enable clean usage of get_kprobe Allow clean use of get_kprobe() outside of core kprobe code. Ftrace makes use of get_kprobe to identify probes installed on mcount call-sites. Signed-off-by: Abhishek Sagar Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: jkenisto@us.ibm.com Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1036631ff4f..04a3556bdea 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -259,6 +259,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); struct jprobe; struct kretprobe; +static inline struct kprobe *get_kprobe(void *addr) +{ + return NULL; +} static inline struct kprobe *kprobe_running(void) { return NULL; -- cgit v1.2.3-70-g09d2 From ecea656d1d5e912d2f3d332657ea4a6d8380f891 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:53 +0530 Subject: ftrace: freeze kprobe'd records Let records identified as being kprobe'd be marked as "frozen". The trouble with records which have a kprobe installed on their mcount call-site is that they don't get updated. So if such a function which is currently being traced gets its tracing disabled due to a new filter rule (or because it was added to the notrace list) then it won't be updated and continue being traced. This patch allows scanning of all frozen records during tracing to check if they should be traced. 
Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++++- kernel/trace/ftrace.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 3 +++ 3 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 366098d591d..3121b95443d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), FTRACE_FL_CONVERTED = (1 << 5), + FTRACE_FL_FROZEN = (1 << 6), }; struct dyn_ftrace { @@ -73,15 +74,18 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +extern int skip_trace(unsigned long ip); + void ftrace_disable_daemon(void); void ftrace_enable_daemon(void); #else +# define skip_trace(ip) ({ 0; }) # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1e9e5c74e6..d1238163155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,6 +163,8 @@ enum { }; static int ftrace_filtered; +static int tracing_on; +static int frozen_record_count; static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; @@ -195,6 +197,71 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; + +#ifdef CONFIG_KPROBES +static inline void freeze_record(struct dyn_ftrace *rec) +{ + if (!(rec->flags & FTRACE_FL_FROZEN)) { + rec->flags |= FTRACE_FL_FROZEN; + frozen_record_count++; + } +} + +static inline void unfreeze_record(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_FROZEN) { + rec->flags &= ~FTRACE_FL_FROZEN; + 
frozen_record_count--; + } +} + +static inline int record_frozen(struct dyn_ftrace *rec) +{ + return rec->flags & FTRACE_FL_FROZEN; +} +#else +# define freeze_record(rec) ({ 0; }) +# define unfreeze_record(rec) ({ 0; }) +# define record_frozen(rec) ({ 0; }) +#endif /* CONFIG_KPROBES */ + +int skip_trace(unsigned long ip) +{ + unsigned long fl; + struct dyn_ftrace *rec; + struct hlist_node *t; + struct hlist_head *head; + + if (frozen_record_count == 0) + return 0; + + head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; + hlist_for_each_entry_rcu(rec, t, head, node) { + if (rec->ip == ip) { + if (record_frozen(rec)) { + if (rec->flags & FTRACE_FL_FAILED) + return 1; + + if (!(rec->flags & FTRACE_FL_CONVERTED)) + return 1; + + if (!tracing_on || !ftrace_enabled) + return 1; + + if (ftrace_filtered) { + fl = rec->flags & (FTRACE_FL_FILTER | + FTRACE_FL_NOTRACE); + if (!fl || (fl & FTRACE_FL_NOTRACE)) + return 1; + } + } + break; + } + } + + return 0; +} + static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { @@ -489,8 +556,11 @@ static int __ftrace_modify_code(void *data) */ __ftrace_update_code(NULL); ftrace_replace_code(1); - } else if (*command & FTRACE_DISABLE_CALLS) + tracing_on = 1; + } else if (*command & FTRACE_DISABLE_CALLS) { ftrace_replace_code(0); + tracing_on = 0; + } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6e9dae7eb41..9ade79369bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -988,6 +988,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; + if (skip_trace(ip)) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; -- cgit v1.2.3-70-g09d2 From 3da757daf86e498872855f0b5e101f763ba79499 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 20 Jun 2008 15:06:33 -0700 Subject: x86: use cpu_khz for loops_per_jiffy 
calculation On the x86 platform we can use the value of tsc_khz computed during tsc calibration to calculate the loops_per_jiffy value. It's very important to keep the error in lpj values to a minimum as any error in that may result in kernel panic in check_timer. In a virtualization environment, on a highly overloaded host the guest delay calibration may sometimes result in errors beyond the ~50% that timer_irq_works can handle, resulting in the guest panicking. Does some formatting changes to lpj_setup code to now have a single printk to print the bogomips value. We do this only for the boot processor because the APs can have different base frequencies or the BIOS might boot an AP at a different frequency. Signed-off-by: Alok N Kataria Cc: Arjan van de Ven Cc: Daniel Hecht Cc: Tim Mann Cc: Zach Amsden Cc: Sahil Rihan Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 ++ arch/x86/kernel/tsc_32.c | 5 +++++ include/linux/delay.h | 1 + init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index c737849e2ef..12b4a71bd07 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,6 +123,8 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); + lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759db63d..be729035b30 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -401,6 +401,7 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { int cpu; + u64 lpj; if (!cpu_has_tsc || tsc_disabled) { /* Disable the TSC in case of !cpu_has_tsc */ @@ -421,6 +422,10 @@ void __init tsc_init(void) return; } + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_tsc = lpj; +
printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); diff --git a/include/linux/delay.h b/include/linux/delay.h index 54552d21296..01aec60590a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,6 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif +extern unsigned long lpj_tsc; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index ecb3822d4f7..86286974dad 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -8,7 +8,9 @@ #include #include #include +#include +unsigned long lpj_tsc; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -108,6 +110,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the tsc frequency. + * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -118,20 +124,20 @@ void __cpuinit calibrate_delay(void) if (preset_lpj) { loops_per_jiffy = preset_lpj; - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); + printk(KERN_INFO + "Calibrating delay loop (skipped) preset value.. "); + } else if ((smp_processor_id() == 0) && lpj_tsc) { + loops_per_jiffy = lpj_tsc; + printk(KERN_INFO + "Calibrating delay loop (skipped), " + "using tsc calculated value.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. 
"); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); + printk(KERN_INFO + "Calibrating delay using timer specific routine.. "); } else { loops_per_jiffy = (1<<12); - printk(KERN_DEBUG "Calibrating delay loop... "); + printk(KERN_INFO "Calibrating delay loop... "); while ((loops_per_jiffy <<= 1) != 0) { /* wait for "start of" clock tick */ ticks = jiffies; @@ -161,12 +167,8 @@ void __cpuinit calibrate_delay(void) if (jiffies != ticks) /* longer than 1 tick */ loops_per_jiffy &= ~loopbit; } - - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); } - + printk(KERN_INFO "%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); } -- cgit v1.2.3-70-g09d2 From 961ccddd59d627b89bd3dc284b6517833bbdf25d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 23 Jun 2008 13:55:38 +1000 Subject: sched: add new API sched_setscheduler_nocheck: add a flag to control access checks Hidehiro Kawai noticed that sched_setscheduler() can fail in stop_machine: it calls sched_setscheduler() from insmod, which can have CAP_SYS_MODULE without CAP_SYS_NICE. 
Two cases could have failed, so are changed to sched_setscheduler_nocheck: kernel/softirq.c:cpu_callback() - CPU hotplug callback kernel/stop_machine.c:__stop_machine_run() - Called from various places, including modprobe() Signed-off-by: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Hidehiro Kawai Cc: Andrew Morton Cc: linux-mm@kvack.org Cc: sugita Cc: Satoshi OSHIMA Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 48 ++++++++++++++++++++++++++++++++++++------------ kernel/softirq.c | 2 +- kernel/stop_machine.c | 2 +- 4 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8..fe3b9b5d739 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1655,6 +1655,8 @@ extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler_nocheck(struct task_struct *, int, + struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index b048ad8a11a..8d7c246ab86 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4746,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. 
- */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4787,7 +4779,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -4823,7 +4815,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (user + && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4872,8 +4865,39 @@ recheck: return 0; } + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} EXPORT_SYMBOL_GPL(sched_setscheduler); +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. 
+ */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e06174004..afd9120c2fc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -645,7 +645,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler(p, SCHED_FIFO, &param); + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); kthread_stop(p); takeover_tasklets(hotcpu); break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7350bbfb07..ba9b2054ecb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -187,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(p, SCHED_FIFO, &param); + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); kthread_bind(p, cpu); wake_up_process(p); wait_for_completion(&smdata.done); -- cgit v1.2.3-70-g09d2 From a033c332e047397904ed74816946b2edd9b0d5cd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Jun 2008 10:52:42 +0800 Subject: lockdep: remove duplicate definition of STATIC_LOCKDEP_MAP_INIT STATIC_LOCKDEP_MAP_INIT is defined twice in lockdep.h. I guess it's a copy & paste.
Signed-off-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b26fbc715a5..2486eb4edbf 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -278,14 +278,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, lockdep_init_map(&(lock)->dep_map, #lock, \ (lock)->dep_map.key, sub) -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), } - - /* * Acquire a lock. * -- cgit v1.2.3-70-g09d2 From f3f3149f35b9195ef4b761b1353fc0766b5f53be Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Jun 2008 18:21:56 -0700 Subject: x86: use cpu_khz for loops_per_jiffy calculation, cleanup As suggested by Ingo, remove all references to tsc from init/calibrate.c TSC is x86 specific, and using tsc in variable names in a generic file should be avoided. lpj_tsc is now called lpj_fine, since it is related to fine tuning of lpj value. 
Also tsc_rate_* is called timer_rate_* Signed-off-by: Alok N Kataria Cc: Arjan van de Ven Cc: Daniel Hecht Cc: Tim Mann Cc: Zach Amsden Cc: Sahil Rihan Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- arch/x86/kernel/tsc_32.c | 2 +- include/linux/delay.h | 2 +- init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 12b4a71bd07..39ae8511a13 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,7 +123,7 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); - lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 0af49fb533e..048baab7726 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -425,7 +425,7 @@ void __init tsc_init(void) lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); - lpj_tsc = lpj; + lpj_fine = lpj; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/include/linux/delay.h b/include/linux/delay.h index 01aec60590a..fd832c6d419 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,7 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif -extern unsigned long lpj_tsc; +extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index 86286974dad..7963e3fc51d 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -10,7 +10,7 @@ #include #include -unsigned long lpj_tsc; +unsigned long lpj_fine; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -35,9 +35,9 @@ static unsigned long 
__cpuinit calibrate_delay_direct(void) unsigned long pre_start, start, post_start; unsigned long pre_end, end, post_end; unsigned long start_jiffies; - unsigned long tsc_rate_min, tsc_rate_max; - unsigned long good_tsc_sum = 0; - unsigned long good_tsc_count = 0; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; int i; if (read_current_timer(&pre_start) < 0 ) @@ -81,22 +81,24 @@ static unsigned long __cpuinit calibrate_delay_direct(void) } read_current_timer(&post_end); - tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; - tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; /* - * If the upper limit and lower limit of the tsc_rate is + * If the upper limit and lower limit of the timer_rate is * >= 12.5% apart, redo calibration. */ if (pre_start != 0 && pre_end != 0 && - (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { - good_tsc_count++; - good_tsc_sum += tsc_rate_max; + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; } } - if (good_tsc_count) - return (good_tsc_sum/good_tsc_count); + if (good_timer_count) + return (good_timer_sum/good_timer_count); printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. Consider using \"lpj=\" boot option.\n"); @@ -111,8 +113,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% * For the boot cpu we can skip the delay calibration and assign it a value - * calculated based on the tsc frequency. - * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * calculated based on the timer frequency. 
+ * For the rest of the CPUs we cannot assume that the timer frequency is same as * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -126,11 +128,11 @@ void __cpuinit calibrate_delay(void) loops_per_jiffy = preset_lpj; printk(KERN_INFO "Calibrating delay loop (skipped) preset value.. "); - } else if ((smp_processor_id() == 0) && lpj_tsc) { - loops_per_jiffy = lpj_tsc; + } else if ((smp_processor_id() == 0) && lpj_fine) { + loops_per_jiffy = lpj_fine; printk(KERN_INFO "Calibrating delay loop (skipped), " - "using tsc calculated value.. "); + "value calculated using timer frequency.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { printk(KERN_INFO "Calibrating delay using timer specific routine.. "); -- cgit v1.2.3-70-g09d2 From d8de72473effd674a3c1fe9621821f406f5587c9 Mon Sep 17 00:00:00 2001 From: Peng Haitao Date: Tue, 20 May 2008 09:13:02 +0800 Subject: [PATCH] remove useless argument type in audit_filter_user() The second argument "type" is not used in audit_filter_user(), so I think that type can be removed. If I'm wrong, please tell me. 
Signed-off-by: Peng Haitao Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 63c3bb98558..8b82974bdc1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -571,7 +571,7 @@ extern void audit_log_lost(const char *message); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ -extern int audit_filter_user(struct netlink_skb_parms *cb, int type); +extern int audit_filter_user(struct netlink_skb_parms *cb); extern int audit_filter_type(int type); extern int audit_receive_filter(int type, int pid, int uid, int seq, void *data, size_t datasz, uid_t loginuid, diff --git a/kernel/audit.c b/kernel/audit.c index 56f30287e24..e092f1c0ce3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb), msg_type); + err = audit_filter_user(&NETLINK_CB(skb)); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 75cdf262851..98c50cc671b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1721,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb, int type) +int audit_filter_user(struct netlink_skb_parms *cb) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; -- cgit v1.2.3-70-g09d2 From 16d752397301b95abaa95cbaf9e785d221872311 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Tue, 24 Jun 2008 19:38:56 +0200 Subject: thermal: Create CONFIG_THERMAL_HWMON=n A bug in libsensors <= 2.10.6 is exposed when this new hwmon I/F is enabled. 
Create CONFIG_THERMAL_HWMON=n until some time after libsensors 2.10.7 ships so those users can run the latest kernel. libsensors 3.x is already fixed -- those users can use CONFIG_THERMAL_HWMON=y now. Signed-off-by: Rene Herman Acked-by: Mark M. Hoffman Signed-off-by: Len Brown --- Documentation/feature-removal-schedule.txt | 9 +++++++++ drivers/thermal/Kconfig | 9 +++++++++ drivers/thermal/thermal_sys.c | 4 ++-- include/linux/thermal.h | 6 ++---- 4 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5b3f31faed5..46ece3fba6f 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -312,3 +312,12 @@ When: 2.6.26 Why: Implementation became generic; users should now include linux/semaphore.h instead. Who: Matthew Wilcox + +--------------------------- + +What: CONFIG_THERMAL_HWMON +When: January 2009 +Why: This option was introduced just to allow older lm-sensors userspace + to keep working over the upgrade to 2.6.26. At the scheduled time of + removal fixed lm-sensors (2.x or 3.x) should be readily available. +Who: Rene Herman diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 4b628526df0..a86e952ed4c 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -12,3 +12,12 @@ menuconfig THERMAL cooling devices. All platforms with ACPI thermal support can use this driver. If you want this support, you should say Y or M here. + +config THERMAL_HWMON + bool "Hardware monitoring support" + depends on HWMON=y || HWMON=THERMAL + help + The generic thermal sysfs driver's hardware monitoring support + requires a 2.10.7/3.0.2 or later lm-sensors userspace. + + Say Y if your user-space is new enough. 
diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 6098787341f..fe07462d594 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -295,8 +295,8 @@ thermal_cooling_device_trip_point_show(struct device *dev, /* Device management */ -#if defined(CONFIG_HWMON) || \ - (defined(CONFIG_HWMON_MODULE) && defined(CONFIG_THERMAL_MODULE)) +#if defined(CONFIG_THERMAL_HWMON) + /* hwmon sys I/F */ #include static LIST_HEAD(thermal_hwmon_list); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 06d3e6eb9ca..917707e6151 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -66,8 +66,7 @@ struct thermal_cooling_device { ((long)t-2732+5)/10 : ((long)t-2732-5)/10) #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) -#if defined(CONFIG_HWMON) || \ - (defined(CONFIG_HWMON_MODULE) && defined(CONFIG_THERMAL_MODULE)) +#if defined(CONFIG_THERMAL_HWMON) /* thermal zone devices with the same type share one hwmon device */ struct thermal_hwmon_device { char type[THERMAL_NAME_LENGTH]; @@ -94,8 +93,7 @@ struct thermal_zone_device { struct idr idr; struct mutex lock; /* protect cooling devices list */ struct list_head node; -#if defined(CONFIG_HWMON) || \ - (defined(CONFIG_HWMON_MODULE) && defined(CONFIG_THERMAL_MODULE)) +#if defined(CONFIG_THERMAL_HWMON) struct list_head hwmon_node; struct thermal_hwmon_device *hwmon; struct thermal_hwmon_attr temp_input; /* hwmon sys attr */ -- cgit v1.2.3-70-g09d2 From 1bdad606338debc6384b2844f1b53cc436b3ac90 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:09:53 +0100 Subject: [GFS2] Remove remote lock dropping code There are several reasons why this is undesirable: 1. It never happens during normal operation anyway 2. If it does happen it causes performance to be very, very poor 3. It isn't likely to solve the original problem (memory shortage on remote DLM node) it was supposed to solve 4. 
It uses a bunch of arbitrary constants which are unlikely to be correct for any particular situation and for which the tuning seems to be a black art. 5. In an N node cluster, only 1/N of the dropped locks will actually contribute to solving the problem on average. So all in all we are better off without it. This also makes merging the lock_dlm module into GFS2 a bit easier. Signed-off-by: Steven Whitehouse --- fs/gfs2/gfs2.h | 5 ----- fs/gfs2/glock.c | 12 +++--------- fs/gfs2/glock.h | 2 +- fs/gfs2/locking/dlm/lock_dlm.h | 3 --- fs/gfs2/locking/dlm/mount.c | 3 --- fs/gfs2/locking/dlm/sysfs.c | 13 ------------- fs/gfs2/locking/dlm/thread.c | 19 ------------------- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/ops_super.c | 2 +- fs/gfs2/sys.c | 14 -------------- include/linux/lm_interface.h | 4 ---- 11 files changed, 6 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b5..ef606e3a5cf 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h @@ -15,11 +15,6 @@ enum { CREATE = 1, }; -enum { - NO_WAIT = 0, - WAIT = 1, -}; - enum { NO_FORCE = 0, FORCE = 1, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index be7ed503f01..8d5450f3c3e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1316,11 +1316,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) wake_up_process(sdp->sd_recoverd_process); return; - case LM_CB_DROPLOCKS: - gfs2_gl_hash_clear(sdp, NO_WAIT); - gfs2_quota_scan(sdp); - return; - default: gfs2_assert_warn(sdp, 0); return; @@ -1508,11 +1503,10 @@ static void clear_glock(struct gfs2_glock *gl) * @sdp: the filesystem * @wait: wait until it's all gone * - * Called when unmounting the filesystem, or when inter-node lock manager - * requests DROPLOCKS because it is running out of capacity. + * Called when unmounting the filesystem.
*/ -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long t; unsigned int x; @@ -1527,7 +1521,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) cont = 1; } - if (!wait || !cont) + if (!cont) break; if (time_after_eq(jiffies, diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 7389f8ef0a3..971d92af70f 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -132,7 +132,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index ad944c64eab..845a27fd303 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -79,9 +79,6 @@ struct gdlm_ls { wait_queue_head_t wait_control; struct task_struct *thread; wait_queue_head_t thread_wait; - unsigned long drop_time; - int drop_locks_count; - int drop_locks_period; }; enum { diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0628520a445..fa31c54c2e6 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -22,8 +22,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = GDLM_DROP_COUNT; - ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; @@ -33,7 +31,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); - ls->drop_time = jiffies; ls->jid = -1; strncpy(buf, table_name, 256); diff --git a/fs/gfs2/locking/dlm/sysfs.c 
b/fs/gfs2/locking/dlm/sysfs.c index a4ff271df9e..4ec571c3d8a 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } -static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->drop_locks_count); -} - -static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->drop_locks_count = simple_strtol(buf, NULL, 0); - return len; -} - struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); -GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, - &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index f30350abd62..38823efd698 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -20,19 +20,6 @@ static inline int no_work(struct gdlm_ls *ls) return ret; } -static inline int check_drop(struct gdlm_ls *ls) -{ - if (!ls->drop_locks_count) - return 0; - - if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { - ls->drop_time = jiffies; - if (ls->all_locks_count >= ls->drop_locks_count) - return 1; - } - return 0; -} - static int gdlm_thread(void *data) { struct gdlm_ls *ls = (struct gdlm_ls *) data; @@ -52,12 +39,6 @@ static int gdlm_thread(void *data) gdlm_do_lock(lp); spin_lock(&ls->async_lock); } - /* Does this ever happen these days? 
I hope not anyway */ - if (check_drop(ls)) { - spin_unlock(&ls->async_lock); - ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - spin_lock(&ls->async_lock); - } spin_unlock(&ls->async_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9bd97c5543b..6ba69dd1a72 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -874,7 +874,7 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); while (invalidate_inodes(sb)) yield(); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 66907922109..f66ea0f7a35 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb) gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9ab9fc85ecd..6f7e2e5858e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, return len; } -static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (simple_strtol(buf, NULL, 0) != 1) - return -EINVAL; - - gfs2_gl_hash_clear(sdp, NO_WAIT); - return len; -} - static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { @@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); -GFS2_ATTR(shrink, 0200, NULL, shrink_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); @@ -186,7 +173,6 @@ 
static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, &gfs2_attr_freeze.attr, - &gfs2_attr_shrink.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, &gfs2_attr_quota_sync.attr, diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index f274997bc28..d0a7112b971 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -138,9 +138,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); * LM_CB_NEED_RECOVERY * The given journal needs to be recovered. * - * LM_CB_DROPLOCKS - * Reduce the number of cached locks. - * * LM_CB_ASYNC * The given lock has been granted. */ @@ -149,7 +146,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); #define LM_CB_NEED_D 258 #define LM_CB_NEED_S 259 #define LM_CB_NEED_RECOVERY 260 -#define LM_CB_DROPLOCKS 261 #define LM_CB_ASYNC 262 /* -- cgit v1.2.3-70-g09d2 From b2cad26cfc2091050574a460b304ed103a35dbda Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:34:14 +0100 Subject: [GFS2] Remove obsolete conversion deadlock avoidance code This is only used by GFS1 so can be removed. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/lock.c | 23 +---------------------- include/linux/lm_interface.h | 2 -- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 871ffc9578f..894df4567a0 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -80,7 +80,6 @@ static void process_complete(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; struct lm_async_cb acb; - s16 prev_mode = lp->cur; memset(&acb, 0, sizeof(acb)); @@ -160,15 +159,7 @@ static void process_complete(struct gdlm_lock *lp) lp->lksb.sb_status, lp->lockname.ln_type, (unsigned long long)lp->lockname.ln_number, lp->flags); - if (lp->lksb.sb_status == -EDEADLOCK && - lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CONV_DEADLK; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } else - return; + return; } /* @@ -268,10 +259,6 @@ out: acb.lc_name = lp->lockname; acb.lc_ret |= gdlm_make_lmstate(lp->cur); - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); } @@ -376,14 +363,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, if (lp->lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; - - /* Conversion deadlock avoidance by DLM */ - - if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && - !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && - !(lkf & DLM_LKF_NOQUEUE) && - cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) - lkf |= DLM_LKF_CONVDEADLK; } if (lp->lvb) diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index d0a7112b971..2ed8fa1b762 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -122,11 +122,9 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); */ #define LM_OUT_ST_MASK 0x00000003 -#define 
LM_OUT_CACHEABLE 0x00000004 #define LM_OUT_CANCELED 0x00000008 #define LM_OUT_ASYNC 0x00000080 #define LM_OUT_ERROR 0x00000100 -#define LM_OUT_CONV_DEADLK 0x00000200 /* * lm_callback_t types -- cgit v1.2.3-70-g09d2 From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:14 +0200 Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling Try again.. Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_debug.c | 5 + kernel/sched_fair.c | 124 +++++++++------ kernel/sched_rt.c | 4 + 5 files changed, 489 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072db..97a58b622ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,6 +765,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index f653af684fb..874b6da1543 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -403,6 +403,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid 
depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. 
+ */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. 
+ */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. 
+ */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. 
+ */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + #endif #include "sched_stats.h" @@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + 
inc_nr_running(rq); } /* @@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3383,8 +3731,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3399,8 +3748,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); 
sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7757,6 +8113,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long 
shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8e077b9c91c..04394ccac88 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e197b8e43f..183388c4dea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + 
add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -580,6 +597,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; + int busiest_cpu = 
cpu_of(busiest); + struct task_group *tg; - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; + + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + imbalance = (busiest_weight - this_weight) / 2; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct 
rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6b4a6b5a416..765932d0399 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit v1.2.3-70-g09d2 From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:18 +0200 Subject: sched: fix sched_domain aggregation Keeping the aggregate on the first cpu of the sched domain has two problems: - it could collide between different sched domains on different cpus - it could slow things down because of the remote accesses Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 113 ++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 12 +++--- 3 files changed, 60 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 97a58b622ee..eaf821072db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,7 +765,6 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the 
domain */ cpumask_t span; /* span of all CPUs in this domain */ - int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 7d282c52bd4..160d3c209b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); */ static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) +aggregate(struct task_group *tg, int cpu) { - return &tg->cfs_rq[sd->first_cpu]->aggregate; + return &tg->cfs_rq[cpu]->aggregate; } -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); +typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when @@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); */ static void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) + int cpu, struct sched_domain *sd) { struct task_group *parent, *child; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, sd); + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1508,7 +1508,7 @@ down: up: continue; } - (*up)(parent, sd); + (*up)(parent, cpu, sd); child = parent; parent = parent->parent; @@ -1520,8 +1520,8 @@ up: /* * Calculate the aggregate runqueue weight. 
*/ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long task_weight = 0; @@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) task_weight += tg->cfs_rq[i]->task_weight; } - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; + aggregate(tg, cpu)->rq_weight = rq_weight; + aggregate(tg, cpu)->task_weight = task_weight; } /* * Compute the weight of this group on the given cpus. */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long shares = 0; int i; @@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; - aggregate(tg, sd)->shares = shares; + aggregate(tg, cpu)->shares = shares; } /* * Compute the load fraction assigned to this group, relies on the aggregate * weight and this group's parent's load, i.e. top-down. 
*/ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; @@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) load += cpu_rq(i)->load.weight; } else { - load = aggregate(tg->parent, sd)->load; + load = aggregate(tg->parent, cpu)->load; /* * shares is our weight in the parent's rq so * shares/parent->rq_weight gives our fraction of the load */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; + load *= aggregate(tg, cpu)->shares; + load /= aggregate(tg->parent, cpu)->rq_weight + 1; } - aggregate(tg, sd)->load = load; + aggregate(tg, cpu)->load = load; } static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) +__update_group_shares_cpu(struct task_group *tg, int cpu, + struct sched_domain *sd, int tcpu) { int boost = 0; unsigned long shares; @@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * \Sum rq_weight * */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; + shares = aggregate(tg, cpu)->shares * rq_weight; + shares /= aggregate(tg, cpu)->rq_weight + 1; /* * record the actual number of shares, not the boosted amount. @@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * task went to. 
*/ static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, +__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { unsigned long shares; shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); + __update_group_shares_cpu(tg, cpu, sd, scpu); + __update_group_shares_cpu(tg, cpu, sd, dcpu); /* * ensure we never loose shares due to rounding errors in the @@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd, * we need to walk up the tree and change all shares until we hit the root. */ static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, +move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); + __move_group_shares(tg, cpu, sd, scpu, dcpu); tg = tg->parent; } } -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, sd)->shares; + unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); + __update_group_shares_cpu(tg, cpu, sd, i); spin_unlock_irqrestore(&rq->lock, flags); } - aggregate_group_shares(tg, sd); + aggregate_group_shares(tg, cpu, sd); /* * ensure we never loose shares due to rounding errors in the * above redistribution. 
*/ - shares -= aggregate(tg, sd)->shares; + shares -= aggregate(tg, cpu)->shares; if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; + tg->cfs_rq[cpu]->shares += shares; + aggregate(tg, cpu)->shares += shares; } } @@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) * Calculate the accumulative weight and recursive load of each task group * while walking down the tree. */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); + aggregate_group_weight(tg, cpu, sd); + aggregate_group_shares(tg, cpu, sd); + aggregate_group_load(tg, cpu, sd); } /* * Rebalance the cpu shares while walking back up the tree. */ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_set_shares(tg, sd); + aggregate_group_set_shares(tg, cpu, sd); } static DEFINE_PER_CPU(spinlock_t, aggregate_lock); @@ -1731,18 +1731,18 @@ static void __init init_aggregate(void) spin_lock_init(&per_cpu(aggregate_lock, i)); } -static int get_aggregate(struct sched_domain *sd) +static int get_aggregate(int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) return 0; - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); return 1; } -static void put_aggregate(struct sched_domain *sd) +static void put_aggregate(int cpu, struct sched_domain *sd) { - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); + spin_unlock(&per_cpu(aggregate_lock, cpu)); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, 
unsigned long shares) @@ -1756,12 +1756,12 @@ static inline void init_aggregate(void) { } -static inline int get_aggregate(struct sched_domain *sd) +static inline int get_aggregate(int cpu, struct sched_domain *sd) { return 0; } -static inline void put_aggregate(struct sched_domain *sd) +static inline void put_aggregate(int cpu, struct sched_domain *sd) { } #endif @@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); + unlock_aggregate = get_aggregate(this_cpu, sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3678,7 +3678,7 @@ out_one_pinned: ld_moved = 0; out: if (unlock_aggregate) - put_aggregate(sd); + put_aggregate(this_cpu, sd); return ld_moved; } @@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - 
sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 509092af033..40cf24ab4de 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!aggregate(tg, this_cpu)->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; + rem_load /= aggregate(tg, this_cpu)->load + 1; this_weight = tg->cfs_rq[this_cpu]->task_weight; busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; @@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= aggregate(tg, this_cpu)->load; + moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3-70-g09d2 From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:35 +0200 Subject: sched: update shares on wakeup We found that the affine wakeup code needs rather accurate load figures to be effective. The trouble is that updating the load figures is fairly expensive with group scheduling. Therefore ratelimit the updating. 
Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 30 +++++++++++++++++++++++++++++- kernel/sched_features.h | 3 ++- kernel/sysctl.c | 8 ++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072db..835b6c6fcc5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -783,6 +783,8 @@ struct sched_domain { unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ + u64 last_update; + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -1605,6 +1607,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_shares_ratelimit; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 1cff969f664..62db0891025 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -777,6 +777,12 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * ratelimit for updating the group shares. + * default: 0.5ms + */ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; + /* * period over which we measure -rt task cpu usage in us. 
* default: 1s @@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) static void update_shares(struct sched_domain *sd) { - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + } } static void update_shares_locked(struct rq *rq, struct sched_domain *sd) @@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + update_shares(sd); + break; + } + } + } +#endif + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d56e3053e74..7d616d2a2a3 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file +SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_WAKEUP_UPDATE, 1) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..fe8cdc80ff0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_ratelimit", + .data = &sysctl_sched_shares_ratelimit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit v1.2.3-70-g09d2 From b660398101cd0622325480a67ac88bb4d33d553a Mon Sep 17 00:00:00 2001 From: 
David Woodhouse Date: Fri, 27 Jun 2008 14:39:42 +0100 Subject: kbuild: fix a.out.h export to userspace with O= build. We need to check for existence of the a.out.h header in the source tree, not the object tree, if we want it to get the right answer with O=. Signed-off-by: David Woodhouse Signed-off-by: Sam Ravnborg --- include/asm-generic/Kbuild.asm | 2 +- include/asm-powerpc/Kbuild | 1 - include/linux/Kbuild | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 92a6d91d0c1..7cd25b8e7c9 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -1,6 +1,6 @@ header-y += kvm.h -ifeq ($(wildcard include/asm-$(SRCARCH)/a.out.h),include/asm-$(SRCARCH)/a.out.h) +ifneq ($(wildcard $(srctree)/include/asm-$(SRCARCH)/a.out.h),) unifdef-y += a.out.h endif unifdef-y += auxvec.h diff --git a/include/asm-powerpc/Kbuild b/include/asm-powerpc/Kbuild index 7381916dfcb..bca352e033c 100644 --- a/include/asm-powerpc/Kbuild +++ b/include/asm-powerpc/Kbuild @@ -1,6 +1,5 @@ include include/asm-generic/Kbuild.asm -header-y += a.out.h header-y += auxvec.h header-y += ioctls.h header-y += mman.h diff --git a/include/linux/Kbuild b/include/linux/Kbuild index b6fbb2573e8..71d70d1fbce 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -166,7 +166,7 @@ unifdef-y += acct.h unifdef-y += adb.h unifdef-y += adfs_fs.h unifdef-y += agpgart.h -ifeq ($(wildcard include/asm-$(SRCARCH)/a.out.h),include/asm-$(SRCARCH)/a.out.h) +ifneq ($(wildcard $(srctree)/include/asm-$(SRCARCH)/a.out.h),) unifdef-y += a.out.h endif unifdef-y += apm_bios.h -- cgit v1.2.3-70-g09d2 From c88e6f51c2154c7606f7e281bcca2d1a2c89d7b2 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 27 Jun 2008 19:54:54 -0700 Subject: include/linux/netdevice.h: don't export MAX_HEADER to userspace Due to the CONFIG_'s the value is anyway not correct in userspace. 
Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f27fd200933..25f87102ab6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -88,6 +88,8 @@ struct wireless_dev; #define NETDEV_TX_BUSY 1 /* driver tx path was busy*/ #define NETDEV_TX_LOCKED -1 /* driver tx lock was already taken */ +#ifdef __KERNEL__ + /* * Compute the worst case header length according to the protocols * used. @@ -114,6 +116,8 @@ struct wireless_dev; #define MAX_HEADER (LL_MAX_HEADER + 48) #endif +#endif /* __KERNEL__ */ + struct net_device_subqueue { /* Give a control state for each queue. This struct may contain -- cgit v1.2.3-70-g09d2 From 251a4b320f2352598f84e4452ab538aa8064af52 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Fri, 27 Jun 2008 20:09:00 -0700 Subject: net/inet_lro: remove setting skb->ip_summed when not LRO-able When an SKB cannot be chained to a session, the current code attempts to "restore" its ip_summed field from lro_mgr->ip_summed. However, lro_mgr->ip_summed does not hold the original value; in fact, we'd better not touch skb->ip_summed since it is not modified by the code in the path leading to a failure to chain it. Also use a clearer comment to describe the ip_summed field of struct net_lro_mgr. Issue raised by Or Gerlitz Signed-off-by: Eli Cohen Signed-off-by: David S. 
Miller --- include/linux/inet_lro.h | 6 +++++- net/ipv4/inet_lro.c | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h index 80335b7d77c..c4335faebb6 100644 --- a/include/linux/inet_lro.h +++ b/include/linux/inet_lro.h @@ -84,7 +84,11 @@ struct net_lro_mgr { from received packets and eth protocol is still ETH_P_8021Q */ - u32 ip_summed; /* Set in non generated SKBs in page mode */ + /* + * Set for generated SKBs that are not added to + * the frag list in fragmented mode + */ + u32 ip_summed; u32 ip_summed_aggr; /* Set in aggregated SKBs: CHECKSUM_UNNECESSARY * or CHECKSUM_NONE */ diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 4a4d49fca1f..cfd034a2b96 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -383,8 +383,7 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, out2: /* send aggregated SKBs to stack */ lro_flush(lro_mgr, lro_desc); -out: /* Original SKB has to be posted to stack */ - skb->ip_summed = lro_mgr->ip_summed; +out: return 1; } -- cgit v1.2.3-70-g09d2 From 4bbff7e408a54cce88d26191191e8bcda2a60d55 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Thu, 26 Jun 2008 09:13:48 -0400 Subject: Input: add KEY_MEDIA_REPEAT definition This patch adds the Repeat key to the input layer. The usage in the HUT is 0xBC (listed under "15.7 Transport Controls"). 
Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index e075c4b762f..d150c57e5f0 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -534,8 +534,8 @@ struct input_absinfo { #define KEY_FRAMEBACK 0x1b4 /* Consumer - transport controls */ #define KEY_FRAMEFORWARD 0x1b5 - #define KEY_CONTEXT_MENU 0x1b6 /* GenDesc - system context menu */ +#define KEY_MEDIA_REPEAT 0x1b7 /* Consumer - transport control */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 -- cgit v1.2.3-70-g09d2 From 18ce3751ccd488c78d3827e9f6bf54e6322676fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jul 2008 09:07:34 +0200 Subject: Properly notify block layer of sync writes fsync_buffers_list() and sync_dirty_buffer() both issue async writes and then immediately wait on them. Conceptually, that makes them sync writes and we should treat them as such so that the IO schedulers can handle them appropriately. This patch fixes a write starvation issue that Lin Ming reported, where xx is stuck for more than 2 minutes because of a large number of synchronous IO in the system: INFO: task kjournald:20558 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
kjournald D ffff810010820978 6712 20558 2 ffff81022ddb1d10 0000000000000046 ffff81022e7baa10 ffffffff803ba6f2 ffff81022ecd0000 ffff8101e6dc9160 ffff81022ecd0348 000000008048b6cb 0000000000000086 ffff81022c4e8d30 0000000000000000 ffffffff80247537 Call Trace: [] kobject_get+0x12/0x17 [] getnstimeofday+0x2f/0x83 [] sync_buffer+0x0/0x3f [] io_schedule+0x5d/0x9f [] sync_buffer+0x3b/0x3f [] __wait_on_bit+0x40/0x6f [] sync_buffer+0x0/0x3f [] out_of_line_wait_on_bit+0x6c/0x78 [] wake_bit_function+0x0/0x23 [] sync_dirty_buffer+0x98/0xcb [] journal_commit_transaction+0x97d/0xcb6 [] lock_timer_base+0x26/0x4b [] kjournald+0xc1/0x1fb [] autoremove_wake_function+0x0/0x2e [] kjournald+0x0/0x1fb [] kthread+0x47/0x74 [] schedule_tail+0x28/0x5d [] child_rip+0xa/0x12 [] kthread+0x0/0x74 [] child_rip+0x0/0x12 Lin Ming confirms that this patch fixes the issue. I've run tests with it for the past week and no ill effects have been observed, so I'm proposing it for inclusion into 2.6.26. Signed-off-by: Jens Axboe --- fs/buffer.c | 13 ++++++++----- include/linux/fs.h | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f01..0f51c0f7c26 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. 
*/ - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block(SWRITE_SYNC, 1, &bh); brelse(bh); spin_lock(lock); } @@ -2940,16 +2940,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE) + if (rw == SWRITE || rw == SWRITE_SYNC) lock_buffer(bh); else if (test_set_buffer_locked(bh)) continue; - if (rw == WRITE || rw == SWRITE) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + if (rw == SWRITE_SYNC) + submit_bh(WRITE_SYNC, bh); + else + submit_bh(WRITE, bh); continue; } } else { @@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c108082683..d8e2762ed14 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -83,6 +83,7 @@ extern int dir_notify_enable; #define READ_SYNC (READ | (1 << BIO_RW_SYNC)) #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) +#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC)) #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) #define SEL_IN 1 -- cgit v1.2.3-70-g09d2 From 8e29da9ee8958cc17e27f4053420f1c982614793 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Jul 2008 22:38:18 +0200 Subject: i2c: Fix bad hint about irqs in i2c.h i2c.h mentions -1 as a not-issued irq. This false hint was taken by of_i2c and caused crashes. Don't give any advice as 'no irq' is not consistent across all architectures yet and it is not needed internally by the i2c-core. 
Signed-off-by: Wolfram Sang Signed-off-by: Jean Delvare --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index fb9af6a0fe9..8dc73013219 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -171,7 +171,7 @@ struct i2c_client { struct i2c_adapter *adapter; /* the adapter we sit on */ struct i2c_driver *driver; /* and our access routines */ struct device dev; /* the device structure */ - int irq; /* irq issued by device (or -1) */ + int irq; /* irq issued by device */ struct list_head list; /* DEPRECATED */ struct completion released; }; -- cgit v1.2.3-70-g09d2 From 9465efc9e96135a2cec8154c0c766fa59984a298 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 27 Jun 2008 11:05:24 +0200 Subject: Remove BKL from remote_llseek v2 - Replace remote_llseek with generic_file_llseek_unlocked (to force compilation failures in all users) - Change all users to either use generic_file_llseek_unlocked directly or take the BKL around. I changed the file systems who don't use the BKL for anything (CIFS, GFS) to call it directly. NCPFS and SMBFS and NFS take the BKL, but explicitly in their own source now. I moved them all over in a single patch to avoid unbisectable sections. Open problem: 32bit kernels can corrupt fpos because its modification is not atomic, but they can do that anyways because there's other paths who modify it without BKL. Do we need a special lock for the pos/f_version = 0 checks? Trond says the NFS BKL is likely not needed, but keep it for now until his full audit. 
v2: Use generic_file_llseek_unlocked instead of remote_llseek_unlocked and factor duplicated code (suggested by hch) Cc: Trond.Myklebust@netapp.com Cc: swhiteho@redhat.com Cc: sfrench@samba.org Cc: vandrove@vc.cvut.cz Signed-off-by: Andi Kleen Signed-off-by: Andi Kleen Signed-off-by: Jonathan Corbet --- fs/cifs/cifsfs.c | 2 +- fs/gfs2/ops_file.c | 4 ++-- fs/ncpfs/file.c | 12 +++++++++++- fs/nfs/file.c | 6 +++++- fs/read_write.c | 38 +++++++++++--------------------------- fs/smbfs/file.c | 11 ++++++++++- include/linux/fs.h | 3 ++- 7 files changed, 42 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 427a7c69589..aeff0fe5b6b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -581,7 +581,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(file, offset, origin); + return generic_file_llseek_unlocked(file, offset, origin); } struct file_system_type cifs_fs_type = { diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a06..24dd5945008 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); return error; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b3..6a7d901f193 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "ncplib_kernel.h" @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { return 0; } +static loff_t ncp_remote_llseek(struct file *file, loff_t 
offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations ncp_file_operations = { - .llseek = remote_llseek, + .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, .ioctl = ncp_ioctl, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3536b01164f..a34eb78989f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + loff_t loff; /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; @@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); + lock_kernel(); /* BKL needed? */ + loff = generic_file_llseek_unlocked(filp, offset, origin); + unlock_kernel(); + return loff; } /* diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c6..9ba495d5a29 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { loff_t retval; struct inode *inode = file->f_mapping->host; - mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: offset += inode->i_size; @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { + /* Special lock needed here? 
*/ if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } - mutex_unlock(&inode->i_mutex); return retval; } +EXPORT_SYMBOL(generic_file_llseek_unlocked); -EXPORT_SYMBOL(generic_file_llseek); - -loff_t remote_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t retval; - - lock_kernel(); - switch (origin) { - case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); - break; - case SEEK_CUR: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } - unlock_kernel(); - return retval; + loff_t n; + mutex_lock(&file->f_dentry->d_inode->i_mutex); + n = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return n; } -EXPORT_SYMBOL(remote_llseek); +EXPORT_SYMBOL(generic_file_llseek); loff_t no_llseek(struct file *file, loff_t offset, int origin) { diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7..2294783320c 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) return error; } +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations smb_file_operations = { - .llseek = remote_llseek, + .llseek = smb_remote_llseek, .read = do_sync_read, .aio_read = smb_file_aio_read, .write = do_sync_write, diff --git a/include/linux/fs.h b/include/linux/fs.h index f413085f748..b158e5161bc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1871,7 +1871,8 @@ extern void file_ra_state_init(struct file_ra_state *ra, struct address_space 
*mapping); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); -extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, + int origin); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3-70-g09d2 From 02c62304e6af60f1963695c6bc1bbffe619aa585 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Wed, 11 Jun 2008 09:12:52 +0200 Subject: Added in user-injected messages into blk traces This allows a user to annotate the blk trace stream: writing a suitable message to {/sys/kernel/debug}/block//msg will have it propagated into the trace stream. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/blktrace_api.h | 1 + 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 8d3a2778026..eb9651ccb24 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -244,6 +244,7 @@ err: static void blk_trace_cleanup(struct blk_trace *bt) { relay_close(bt->rchan); + debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); @@ -291,6 +292,44 @@ static const struct file_operations blk_dropped_fops = { .read = blk_dropped_read, }; +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); 
+ return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. @@ -380,6 +419,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->dropped_file) goto err; + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) @@ -409,6 +452,8 @@ err: if (dir) blk_remove_tree(dir); if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e3ef903aae8..d084b8d227a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -129,6 +129,7 @@ struct blk_trace { u32 dev; struct dentry *dir; struct dentry *dropped_file; + struct dentry *msg_file; atomic_t dropped; }; -- cgit v1.2.3-70-g09d2 From 244b4d56f85bcd11b21ab0b94845a3dabeed5c10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jun 2008 20:12:36 +0200 Subject: block: kill request_queue_t Everything was moved to struct request_queue a few kernel revisions ago, maintaining the deprecated typedef to avoid breaking things. Now the time has come to get rid of that typedef. 
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d2a1b71e93c..6a3da671713 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -23,7 +23,6 @@ struct scsi_ioctl_command; struct request_queue; -typedef struct request_queue request_queue_t __deprecated; struct elevator_queue; typedef struct elevator_queue elevator_t; struct request_pm_state; -- cgit v1.2.3-70-g09d2 From 51d654e1d885607a6edd02b337105fa5c28b6d33 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 17 Jun 2008 18:59:56 +0200 Subject: block: Globalize bio_set and bio_vec_slab Move struct bio_set and biovec_slab definitions to bio.h so they can be used outside of bio.c. Signed-off-by: Martin K. Petersen Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/bio.c | 30 ++---------------------------- include/linux/bio.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 78562574cb5..7a6598abc96 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -28,25 +28,10 @@ #include #include /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 mempool_t *bio_split_pool __read_mostly; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -59,24 +44,13 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { }; #undef BV -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. 
- * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; diff --git a/include/linux/bio.h b/include/linux/bio.h index 61c15eaf3fb..49dfb3cb746 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,6 +333,35 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, int, int); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); +extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); + +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 + +struct bio_set { + mempool_t *bio_pool; + mempool_t *bvec_pools[BIOVEC_NR_POOLS]; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +extern struct bio_set *fs_bio_set; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 #ifdef CONFIG_HIGHMEM /* -- cgit v1.2.3-70-g09d2 From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Mon, 30 Jun 2008 20:04:41 +0200 Subject: block: Block layer data integrity support Some block devices support verifying the integrity of requests by way of checksums or other protection information that is submitted along with the I/O. This patch implements support for generating and verifying integrity metadata, as well as correctly merging, splitting and cloning bios and requests that have this extra information attached. See Documentation/block/data-integrity.txt for more information. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 7 + block/blk-integrity.c | 382 ++++++++++++++++++++++++++ block/blk-merge.c | 3 + block/blk.h | 8 + block/elevator.c | 6 + fs/Makefile | 1 + fs/bio-integrity.c | 708 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/bio.c | 32 ++- include/linux/bio.h | 94 ++++++- include/linux/blkdev.h | 105 ++++++++ include/linux/genhd.h | 3 + 13 files changed, 1355 insertions(+), 7 deletions(-) create mode 100644 block/blk-integrity.c create mode 100644 fs/bio-integrity.c (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 3e97f2bc446..1ab7c15c8d7 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -81,6 +81,18 @@ config BLK_DEV_BSG If unsure, say N. +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + ---help--- + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. + + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. 
+ endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 5a43c7d7959..045f7b62e4b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-core.c b/block/blk-core.c index 1905aaba49f..e0fb0bcc0c1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + if (bio->bi_size == 0) bio_endio(bio, error); } else { @@ -1381,6 +1385,9 @@ end_io: */ blk_partition_remap(bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 00000000000..65f23ef38bb --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,382 @@ +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ * + */ + +#include +#include +#include +#include + +#include "blk.h" + +static struct kmem_cache *integrity_cachep; + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @rq: request with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a request. + */ +int blk_rq_count_integrity_sg(struct request *rq) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + unsigned int segments; + + ivprv = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + segments++; + + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). 
+ */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + struct scatterlist *sg; + unsigned int segments; + + ivprv = NULL; + sg = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + sg->length += iv->bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg->page_link &= ~0x02; + sg = sg_next(sg); + } + + sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + segments++; + } + + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two block devices + * @bd1: Device to compare + * @bd2: Device to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two block devices have + * compatible integrity formats. 
+ */ +int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) +{ + struct blk_integrity *b1 = bd1->bd_disk->integrity; + struct blk_integrity *b2 = bd2->bd_disk->integrity; + + BUG_ON(bd1->bd_disk == NULL); + BUG_ON(bd2->bd_disk == NULL); + + if (!b1 || !b2) + return 0; + + if (b1->sector_size != b2->sector_size) { + printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->sector_size, b2->sector_size); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (strcmp(b1->name, b2->name)) { + printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->name, b2->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret 
= 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL && bi->name != NULL) + return sprintf(page, "%s\n", bi->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL) + return sprintf(page, "%u\n", bi->tag_size); + else + return sprintf(page, "0\n"); +} + +static ssize_t integrity_read_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + + return count; +} + +static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); +} + +static ssize_t integrity_write_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + + return count; +} + +static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 
1 : 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = S_IRUGO }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = S_IRUGO }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_read_entry = { + .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_read_show, + .store = integrity_read_store, +}; + +static struct integrity_sysfs_entry integrity_write_entry = { + .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_write_show, + .store = integrity_write_store, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_read_entry.attr, + &integrity_write_entry.attr, + NULL, +}; + +static struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static int __init blk_dev_integrity_init(void) +{ + integrity_cachep = kmem_cache_create("blkdev_integrity", + sizeof(struct blk_integrity), + 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_dev_integrity_init); + +static void blk_integrity_release(struct kobject *kobj) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + + kmem_cache_free(integrity_cachep, bi); +} + +static struct kobj_type integrity_ktype = { + .default_attrs = integrity_attrs, + .sysfs_ops = &integrity_ops, + .release = blk_integrity_release, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: integrity profile + * + * Description: When a device needs to advertise itself as being able + * to send/receive integrity metadata it must use this function to + * register the capability with the block layer. 
The template is a + * blk_integrity struct with values appropriate for the underlying + * hardware. See Documentation/block/data-integrity.txt. + */ +int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi; + + BUG_ON(disk == NULL); + BUG_ON(template == NULL); + + if (disk->integrity == NULL) { + bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + if (!bi) + return -1; + + if (kobject_init_and_add(&bi->kobj, &integrity_ktype, + &disk->dev.kobj, "%s", "integrity")) { + kmem_cache_free(integrity_cachep, bi); + return -1; + } + + kobject_uevent(&bi->kobj, KOBJ_ADD); + + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->sector_size = disk->queue->hardsect_size; + disk->integrity = bi; + } else + bi = disk->integrity; + + /* Use the provided profile as template */ + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Remove block integrity profile + * @disk: disk whose integrity profile to deallocate + * + * Description: This function frees all memory used by the block + * integrity profile. To be called at device teardown. 
+ */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi; + + if (!disk || !disk->integrity) + return; + + bi = disk->integrity; + + kobject_uevent(&bi->kobj, KOBJ_REMOVE); + kobject_del(&bi->kobj); + kobject_put(&disk->dev.kobj); + kmem_cache_free(integrity_cachep, bi); +} +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-merge.c b/block/blk-merge.c index 651136aae76..5efc9e7a68b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn diff --git a/block/blk.h b/block/blk.h index 59776ab4742..c79f30e1df5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + #endif diff --git a/block/elevator.c b/block/elevator.c index 902dd1344d5..1f5bfe69602 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) return 0; + /* + * only merge integrity protected bio into ditto rq + */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da..277b079dec9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += 
inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 00000000000..31b08878913 --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,708 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
+ */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). 
+ */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The function honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? 
*/ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. 
+ */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. 
The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set prior to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. 
In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + 
+ return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. 
+ */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: I/O completion error code + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +{ + struct bio_vec *iv; + 
unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len) */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'len' data + * sectors. 
+ */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Offset + * + * Description: Splits an integrity page into a bio_pair. + */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + 
*/ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 7a6598abc96..7761c84c703 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -50,6 +50,11 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -91,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if 
(bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -249,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -1223,6 +1241,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1264,6 +1285,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1280,6 +1302,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1306,6 +1331,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/include/linux/bio.h b/include/linux/bio.h index 49dfb3cb746..6bfc3e8d9d8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,6 +64,7 @@ struct bio_vec { struct bio_set; struct bio; +struct bio_integrity_payload; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -112,6 +113,9 @@ struct bio { atomic_t bi_cnt; /* pin count */ void *bi_private; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; /* data integrity */ +#endif bio_destructor_t *bi_destructor; /* destructor */ 
}; @@ -271,6 +275,29 @@ static inline void *bio_data(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +#if defined(CONFIG_BLK_DEV_INTEGRITY) +/* + * bio integrity payload + */ +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + struct bio_vec *bip_vec; /* integrity data vector */ + + sector_t bip_sector; /* virtual start sector */ + + void *bip_buf; /* generated integrity data */ + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ + + int bip_error; /* saved I/O error */ + unsigned int bip_size; + + unsigned short bip_pool; /* pool the ivec came from */ + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_idx; /* current bip_vec index */ + + struct work_struct bip_work; /* I/O completion */ +}; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * A bio_pair is used when we need to split a bio. @@ -283,10 +310,14 @@ static inline void *bio_data(struct bio *bio) * in bio2.bi_private */ struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; - atomic_t cnt; - int error; + struct bio bio1, bio2; + struct bio_vec bv1, bv2; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload bip1, bip2; + struct bio_vec iv1, iv2; +#endif + atomic_t cnt; + int error; }; extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors); @@ -334,6 +365,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern unsigned int bvec_nr_vecs(unsigned short idx); /* * bio_set is used to allow other portions of the IO system to @@ -346,6 +378,9 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set struct bio_set { mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif mempool_t *bvec_pools[BIOVEC_NR_POOLS]; }; @@ 
-410,5 +445,56 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) +#define bip_vec(bip) bip_vec_idx(bip, 0) + +#define __bip_for_each_vec(bvl, bip, i, start_idx) \ + for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ + i < (bip)->bip_vcnt; \ + bvl++, i++) + +#define bip_for_each_vec(bvl, bip, i) \ + __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) + +#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) + +extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); +extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); +extern void bio_integrity_free(struct bio *, struct bio_set *); +extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); +extern int bio_integrity_enabled(struct bio *bio); +extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_prep(struct bio *); +extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_advance(struct bio *, unsigned int); +extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); +extern void bio_integrity_split(struct bio *, struct bio_pair *, int); +extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bioset_integrity_create(struct bio_set *, int); +extern void bioset_integrity_free(struct bio_set *); +extern void bio_integrity_init_slab(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define bio_integrity(a) (0) +#define bioset_integrity_create(a, b) (0) +#define bio_integrity_prep(a) (0) +#define bio_integrity_enabled(a) (0) +#define bio_integrity_clone(a, b, c) (0) 
+#define bioset_integrity_free(a) do { } while (0) +#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_endio(a, b) do { } while (0) +#define bio_integrity_advance(a, b) do { } while (0) +#define bio_integrity_trim(a, b, c) do { } while (0) +#define bio_integrity_split(a, b, c) do { } while (0) +#define bio_integrity_set_tag(a, b, c) do { } while (0) +#define bio_integrity_get_tag(a, b, c) do { } while (0) +#define bio_integrity_init_slab(a) do { } while (0) + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a3da671713..4a9ed45270f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define BLK_MAX_CDB 16 @@ -865,6 +867,109 @@ void kblockd_flush_work(struct work_struct *work); MODULE_ALIAS("block-major-" __stringify(major) "-*") +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ + +struct blk_integrity_exchg { + void *prot_buf; + void *data_buf; + sector_t sector; + unsigned int data_size; + unsigned short sector_size; + const char *disk_name; +}; + +typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); +typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); +typedef void (integrity_get_tag_fn) (void *, void *, unsigned 
int); + +struct blk_integrity { + integrity_gen_fn *generate_fn; + integrity_vrfy_fn *verify_fn; + integrity_set_tag_fn *set_tag_fn; + integrity_get_tag_fn *get_tag_fn; + + unsigned short flags; + unsigned short tuple_size; + unsigned short sector_size; + unsigned short tag_size; + + const char *name; + + struct kobject kobj; +}; + +extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_unregister(struct gendisk *); +extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request *); + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static inline unsigned int bdev_get_tag_size(struct block_device *bdev) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi) + return bi->tag_size; + + return 0; +} + +static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + return 1; + + return 0; +} + +static inline int blk_integrity_rq(struct request *rq) +{ + BUG_ON(rq->bio == NULL); + + return bio_integrity(rq->bio); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define blk_integrity_rq(rq) (0) +#define blk_rq_count_integrity_sg(a) (0) +#define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) +#define bdev_get_tag_size(a) (0) +#define blk_integrity_compare(a, b) (0) +#define blk_integrity_register(a, b) (0) +#define 
blk_integrity_unregister(a) do { } while (0); + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + + #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae7aec3cabe..524ec96f5a2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,6 +141,9 @@ struct gendisk { struct disk_stats dkstats; #endif struct work_struct async_notify; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +#endif }; /* -- cgit v1.2.3-70-g09d2 From da9cbc87395308a21465bd25441297bbba0477e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2008 20:42:08 +0200 Subject: block: blkdev.h cleanup, move iocontext stuff to iocontext.h Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 17 ----------------- include/linux/iocontext.h | 18 ++++++++++++++++++ kernel/exit.c | 1 + kernel/fork.c | 1 + 4 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a9ed45270f..443df75d2cd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -33,12 +33,6 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -int put_io_context(struct io_context *ioc); -void exit_io_context(void); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); -void copy_io_context(struct io_context **pdst, struct io_context **psrc); - struct request; typedef void (rq_end_io_fn)(struct request *, int); @@ -981,17 +975,6 @@ static inline long nr_blockdev_pages(void) return 0; } -static inline void exit_io_context(void) -{ -} - -struct io_context; -static inline int put_io_context(struct io_context *ioc) -{ - return 1; -} - - #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2b7a1187cb2..08b987bccf8 100644 --- a/include/linux/iocontext.h +++ 
b/include/linux/iocontext.h @@ -99,4 +99,22 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) return NULL; } +#ifdef CONFIG_BLOCK +int put_io_context(struct io_context *ioc); +void exit_io_context(void); +struct io_context *get_io_context(gfp_t gfp_flags, int node); +struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void copy_io_context(struct io_context **pdst, struct io_context **psrc); +#else +static inline void exit_io_context(void) +{ +} + +struct io_context; +static inline int put_io_context(struct io_context *ioc) +{ + return 1; +} +#endif + #endif diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e69b6..ceb25878283 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..b71ccd09fc8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 6e2401ad6f33de15ff00f78b88159f00a14f3b35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Jun 2008 10:15:02 +0200 Subject: block: integrity cleanups - No need to check for NULL bio, we'll get an immediate oops anyway. - Make bio_integrity() a proper function. Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 ++++++++- include/linux/blkdev.h | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6bfc3e8d9d8..0933a14e641 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -458,7 +458,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) -#define bio_integrity(bio) ((bio)->bi_integrity ? 
1 : 0) +static inline int bio_integrity(struct bio *bio) +{ +#if defined(CONFIG_BLK_DEV_INTEGRITY) + return bio->bi_integrity != NULL; +#else + return 0; +#endif +} extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 443df75d2cd..d3ae9ad9721 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -860,7 +860,6 @@ void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") - #if defined(CONFIG_BLK_DEV_INTEGRITY) #define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ @@ -945,8 +944,6 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) static inline int blk_integrity_rq(struct request *rq) { - BUG_ON(rq->bio == NULL); - return bio_integrity(rq->bio); } @@ -963,7 +960,6 @@ static inline int blk_integrity_rq(struct request *rq) #endif /* CONFIG_BLK_DEV_INTEGRITY */ - #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out -- cgit v1.2.3-70-g09d2 From 0b07de85a76e1346e675f0e98437378932473df7 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Thu, 26 Jun 2008 13:48:27 +0200 Subject: allow userspace to modify scsi command filter on per device basis This patch exports the per-gendisk command filter to user space through sysfs, so it can be changed by the system administrator. All users of the old cmd filter have been converted to use the new one. Original patch from Peter Jones. 
Signed-off-by: Adel Gadllah Signed-off-by: Peter Jones Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/bsg.c | 38 ++++-- block/cmd-filter.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 2 + block/scsi_ioctl.c | 121 +----------------- drivers/scsi/sg.c | 40 ++---- include/linux/blkdev.h | 10 +- include/linux/genhd.h | 9 ++ 8 files changed, 389 insertions(+), 159 deletions(-) create mode 100644 block/cmd-filter.c (limited to 'include/linux') diff --git a/block/Makefile b/block/Makefile index 045f7b62e4b..208000b0750 100644 --- a/block/Makefile +++ b/block/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o + blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ + cmd-filter.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/bsg.c b/block/bsg.c index f0b7cd34321..439940c3a1f 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -44,11 +44,12 @@ struct bsg_device { char name[BUS_ID_SIZE]; int max_queue; unsigned long flags; + struct blk_scsi_cmd_filter *cmd_filter; + mode_t *f_mode; }; enum { BSG_F_BLOCK = 1, - BSG_F_WRITE_PERM = 2, }; #define BSG_DEFAULT_CMDS 64 @@ -172,7 +173,7 @@ unlock: } static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_v4 *hdr, int has_write_perm) + struct sg_io_v4 *hdr, struct bsg_device *bd) { if (hdr->request_len > BLK_MAX_CDB) { rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); @@ -185,7 +186,8 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_cmd_filter_verify_command(bd->cmd_filter, rq->cmd, + bd->f_mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -263,8 +265,7 @@ bsg_map_hdr(struct bsg_device 
*bd, struct sg_io_v4 *hdr) rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM, - &bd->flags)); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd); if (ret) goto out; @@ -566,12 +567,23 @@ static inline void bsg_set_block(struct bsg_device *bd, struct file *file) set_bit(BSG_F_BLOCK, &bd->flags); } -static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file) +static void bsg_set_cmd_filter(struct bsg_device *bd, + struct file *file) { - if (file->f_mode & FMODE_WRITE) - set_bit(BSG_F_WRITE_PERM, &bd->flags); - else - clear_bit(BSG_F_WRITE_PERM, &bd->flags); + struct inode *inode; + struct gendisk *disk; + + if (!file) + return; + + inode = file->f_dentry->d_inode; + if (!inode) + return; + + disk = inode->i_bdev->bd_disk; + + bd->cmd_filter = &disk->cmd_filter; + bd->f_mode = &file->f_mode; } /* @@ -595,6 +607,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) dprintk("%s: read %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); + bytes_read = 0; ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; @@ -668,7 +682,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); - bsg_set_write_perm(bd, file); + bsg_set_cmd_filter(bd, file); bytes_written = 0; ret = __bsg_write(bd, buf, count, &bytes_written); @@ -771,7 +785,9 @@ static struct bsg_device *bsg_add_device(struct inode *inode, } bd->queue = rq; + bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); atomic_set(&bd->ref_count, 1); mutex_lock(&bsg_mutex); diff --git a/block/cmd-filter.c b/block/cmd-filter.c new file mode 100644 index 00000000000..35e327ceaa9 --- /dev/null +++ b/block/cmd-filter.c @@ -0,0 +1,325 @@ +/* + * Copyright 2004 Peter M. 
Jones + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode) +{ + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* if there's no filter set, assume we're filtering everything out */ + if (!filter) + return -EPERM; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && (*f_mode & FMODE_WRITE)) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_cmd_filter_verify_command); + +int blk_verify_command(struct file *file, unsigned char *cmd) +{ + struct gendisk *disk; + struct inode *inode; + + if (!file) + return -EINVAL; + + inode = file->f_dentry->d_inode; + if (!inode) + return -EINVAL; + + disk = inode->i_bdev->bd_disk; + + return blk_cmd_filter_verify_command(&disk->cmd_filter, + cmd, &file->f_mode); +} +EXPORT_SYMBOL(blk_verify_command); + +/* and now, the sysfs stuff */ +static ssize_t rcf_cmds_show(struct blk_scsi_cmd_filter *filter, char *page, + int rw) +{ + char *npage = page; + unsigned long *okbits; + int i; + + if (rw == READ) + okbits = filter->read_ok; + else + okbits = 
filter->write_ok; + + for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) { + if (test_bit(i, okbits)) { + sprintf(npage, "%02x", i); + npage += 2; + if (i < BLK_SCSI_MAX_CMDS - 1) + sprintf(npage++, " "); + } + } + + if (npage != page) + npage += sprintf(npage, "\n"); + + return npage - page; +} + +static ssize_t rcf_readcmds_show(struct blk_scsi_cmd_filter *filter, char *page) +{ + return rcf_cmds_show(filter, page, READ); +} + +static ssize_t rcf_writecmds_show(struct blk_scsi_cmd_filter *filter, + char *page) +{ + return rcf_cmds_show(filter, page, WRITE); +} + +static ssize_t rcf_cmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count, int rw) +{ + ssize_t ret = 0; + unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits; + int cmd, status, len; + substring_t ss; + + memset(&okbits, 0, sizeof(okbits)); + + for (len = strlen(page); len > 0; len -= 3) { + if (len < 2) + break; + ss.from = (char *) page + ret; + ss.to = (char *) page + ret + 2; + ret += 3; + status = match_hex(&ss, &cmd); + /* either of these cases means invalid input, so do nothing. 
*/ + if (status || cmd >= BLK_SCSI_MAX_CMDS) + return -EINVAL; + + __set_bit(cmd, okbits); + } + + if (rw == READ) + target_okbits = filter->read_ok; + else + target_okbits = filter->write_ok; + + memmove(target_okbits, okbits, sizeof(okbits)); + return count; +} + +static ssize_t rcf_readcmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, READ); +} + +static ssize_t rcf_writecmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, WRITE); +} + +struct rcf_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_scsi_cmd_filter *, char *); + ssize_t (*store)(struct blk_scsi_cmd_filter *, const char *, size_t); +}; + +static struct rcf_sysfs_entry rcf_readcmds_entry = { + .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_readcmds_show, + .store = rcf_readcmds_store, +}; + +static struct rcf_sysfs_entry rcf_writecmds_entry = { + .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_writecmds_show, + .store = rcf_writecmds_store, +}; + +static struct attribute *default_attrs[] = { + &rcf_readcmds_entry.attr, + &rcf_writecmds_entry.attr, + NULL, +}; + +#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr) + +static ssize_t +rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + if (entry->show) + return entry->show(filter, page); + + return 0; +} + +static ssize_t +rcf_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!entry->store) + return -EINVAL; + + filter = container_of(kobj, struct 
blk_scsi_cmd_filter, kobj); + return entry->store(filter, page, length); +} + +static struct sysfs_ops rcf_sysfs_ops = { + .show = rcf_attr_show, + .store = rcf_attr_store, +}; + +static struct kobj_type rcf_ktype = { + .sysfs_ops = &rcf_sysfs_ops, + .default_attrs = default_attrs, +}; + +static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) +{ + /* Basic read-only commands */ + __set_bit(TEST_UNIT_READY, filter->read_ok); + __set_bit(REQUEST_SENSE, filter->read_ok); + __set_bit(READ_6, filter->read_ok); + __set_bit(READ_10, filter->read_ok); + __set_bit(READ_12, filter->read_ok); + __set_bit(READ_16, filter->read_ok); + __set_bit(READ_BUFFER, filter->read_ok); + __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_LONG, filter->read_ok); + __set_bit(INQUIRY, filter->read_ok); + __set_bit(MODE_SENSE, filter->read_ok); + __set_bit(MODE_SENSE_10, filter->read_ok); + __set_bit(LOG_SENSE, filter->read_ok); + __set_bit(START_STOP, filter->read_ok); + __set_bit(GPCMD_VERIFY_10, filter->read_ok); + __set_bit(VERIFY_16, filter->read_ok); + __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); + + /* Audio CD commands */ + __set_bit(GPCMD_PLAY_CD, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); + __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); + + /* CD/DVD data reading */ + __set_bit(GPCMD_READ_CD, filter->read_ok); + __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); + __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); + __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); + __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); + __set_bit(GPCMD_READ_HEADER, filter->read_ok); + __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); + __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); + __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); + __set_bit(GPCMD_REPORT_KEY, filter->read_ok); + __set_bit(GPCMD_SCAN, filter->read_ok); + 
__set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); + __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); + __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); + __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); + __set_bit(GPCMD_SEEK, filter->read_ok); + __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); + + /* Basic writing commands */ + __set_bit(WRITE_6, filter->write_ok); + __set_bit(WRITE_10, filter->write_ok); + __set_bit(WRITE_VERIFY, filter->write_ok); + __set_bit(WRITE_12, filter->write_ok); + __set_bit(WRITE_VERIFY_12, filter->write_ok); + __set_bit(WRITE_16, filter->write_ok); + __set_bit(WRITE_LONG, filter->write_ok); + __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(ERASE, filter->write_ok); + __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); + __set_bit(MODE_SELECT, filter->write_ok); + __set_bit(LOG_SELECT, filter->write_ok); + __set_bit(GPCMD_BLANK, filter->write_ok); + __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); + __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); + __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); + __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); + __set_bit(GPCMD_SEND_EVENT, filter->write_ok); + __set_bit(GPCMD_SEND_KEY, filter->write_ok); + __set_bit(GPCMD_SEND_OPC, filter->write_ok); + __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); + __set_bit(GPCMD_SET_SPEED, filter->write_ok); + __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); + __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); + __set_bit(GPCMD_SET_STREAMING, filter->write_ok); +} + +int blk_register_filter(struct gendisk *disk) +{ + int ret; + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + struct kobject *parent = kobject_get(disk->holder_dir->parent); + + if (!parent) + return -ENODEV; + + ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent, + "%s", "cmd_filter"); + + if (ret < 0) + 
return ret; + + rcf_set_defaults(filter); + return 0; +} + +void blk_unregister_filter(struct gendisk *disk) +{ + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + + kobject_put(&filter->kobj); + kobject_put(disk->holder_dir->parent); +} + diff --git a/block/genhd.c b/block/genhd.c index 43e468ee599..9074f384b09 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -189,6 +189,7 @@ void add_disk(struct gendisk *disk) disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + blk_register_filter(disk); bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); @@ -200,6 +201,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { + blk_unregister_filter(disk); sysfs_remove_link(&disk->dev.kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 78199c08ec9..c5b9bcfc0a6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -105,120 +105,12 @@ static int sg_emulated_host(struct request_queue *q, int __user *p) return put_user(1, p); } -#define CMD_READ_SAFE 0x01 -#define CMD_WRITE_SAFE 0x02 -#define CMD_WARNED 0x04 -#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE -#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE - -int blk_verify_command(unsigned char *cmd, int has_write_perm) -{ - static unsigned char cmd_type[256] = { - - /* Basic read-only commands */ - safe_for_read(TEST_UNIT_READY), - safe_for_read(REQUEST_SENSE), - safe_for_read(READ_6), - safe_for_read(READ_10), - safe_for_read(READ_12), - safe_for_read(READ_16), - safe_for_read(READ_BUFFER), - safe_for_read(READ_DEFECT_DATA), - safe_for_read(READ_LONG), - safe_for_read(INQUIRY), - safe_for_read(MODE_SENSE), - safe_for_read(MODE_SENSE_10), - safe_for_read(LOG_SENSE), - safe_for_read(START_STOP), - safe_for_read(GPCMD_VERIFY_10), - safe_for_read(VERIFY_16), - - /* Audio CD 
commands */ - safe_for_read(GPCMD_PLAY_CD), - safe_for_read(GPCMD_PLAY_AUDIO_10), - safe_for_read(GPCMD_PLAY_AUDIO_MSF), - safe_for_read(GPCMD_PLAY_AUDIO_TI), - safe_for_read(GPCMD_PAUSE_RESUME), - - /* CD/DVD data reading */ - safe_for_read(GPCMD_READ_BUFFER_CAPACITY), - safe_for_read(GPCMD_READ_CD), - safe_for_read(GPCMD_READ_CD_MSF), - safe_for_read(GPCMD_READ_DISC_INFO), - safe_for_read(GPCMD_READ_CDVD_CAPACITY), - safe_for_read(GPCMD_READ_DVD_STRUCTURE), - safe_for_read(GPCMD_READ_HEADER), - safe_for_read(GPCMD_READ_TRACK_RZONE_INFO), - safe_for_read(GPCMD_READ_SUBCHANNEL), - safe_for_read(GPCMD_READ_TOC_PMA_ATIP), - safe_for_read(GPCMD_REPORT_KEY), - safe_for_read(GPCMD_SCAN), - safe_for_read(GPCMD_GET_CONFIGURATION), - safe_for_read(GPCMD_READ_FORMAT_CAPACITIES), - safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION), - safe_for_read(GPCMD_GET_PERFORMANCE), - safe_for_read(GPCMD_SEEK), - safe_for_read(GPCMD_STOP_PLAY_SCAN), - - /* Basic writing commands */ - safe_for_write(WRITE_6), - safe_for_write(WRITE_10), - safe_for_write(WRITE_VERIFY), - safe_for_write(WRITE_12), - safe_for_write(WRITE_VERIFY_12), - safe_for_write(WRITE_16), - safe_for_write(WRITE_LONG), - safe_for_write(WRITE_LONG_2), - safe_for_write(ERASE), - safe_for_write(GPCMD_MODE_SELECT_10), - safe_for_write(MODE_SELECT), - safe_for_write(LOG_SELECT), - safe_for_write(GPCMD_BLANK), - safe_for_write(GPCMD_CLOSE_TRACK), - safe_for_write(GPCMD_FLUSH_CACHE), - safe_for_write(GPCMD_FORMAT_UNIT), - safe_for_write(GPCMD_REPAIR_RZONE_TRACK), - safe_for_write(GPCMD_RESERVE_RZONE_TRACK), - safe_for_write(GPCMD_SEND_DVD_STRUCTURE), - safe_for_write(GPCMD_SEND_EVENT), - safe_for_write(GPCMD_SEND_KEY), - safe_for_write(GPCMD_SEND_OPC), - safe_for_write(GPCMD_SEND_CUE_SHEET), - safe_for_write(GPCMD_SET_SPEED), - safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL), - safe_for_write(GPCMD_LOAD_UNLOAD), - safe_for_write(GPCMD_SET_STREAMING), - }; - unsigned char type = cmd_type[cmd[0]]; - - /* Anybody who can open 
the device can do a read-safe command */ - if (type & CMD_READ_SAFE) - return 0; - - /* Write-safe commands just require a writable open.. */ - if ((type & CMD_WRITE_SAFE) && has_write_perm) - return 0; - - /* And root can do any command.. */ - if (capable(CAP_SYS_RAWIO)) - return 0; - - if (!type) { - cmd_type[cmd[0]] = CMD_WARNED; - printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]); - } - - /* Otherwise fail it with an "Operation not permitted" */ - return -EPERM; -} -EXPORT_SYMBOL_GPL(blk_verify_command); - static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_hdr *hdr, int has_write_perm) + struct sg_io_hdr *hdr, struct file *file) { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(file, rq->cmd)) return -EPERM; /* @@ -287,7 +179,7 @@ static int sg_io(struct file *file, struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr) { unsigned long start_time; - int writing = 0, ret = 0, has_write_perm = 0; + int writing = 0, ret = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -316,10 +208,7 @@ static int sg_io(struct file *file, struct request_queue *q, if (!rq) return -ENOMEM; - if (file) - has_write_perm = file->f_mode & FMODE_WRITE; - - if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) { + if (blk_fill_sghdr_rq(q, rq, hdr, file)) { blk_put_request(rq); return -EFAULT; } @@ -451,7 +340,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE); + err = blk_verify_command(file, rq->cmd); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ea0edd1b2e7..f7abccaffae 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -182,8 +182,9 @@ static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, 
int tablesize); static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp); -static ssize_t sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp); +static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, + const char __user *buf, size_t count, int blocking, + int read_only, Sg_request **o_srp); static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking); static int sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind, @@ -204,7 +205,6 @@ static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); static int sg_res_in_use(Sg_fd * sfp); -static int sg_allow_access(unsigned char opcode, char dev_type); static int sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len); static Sg_device *sg_get_dev(int dev); #ifdef CONFIG_SCSI_PROC_FS @@ -544,7 +544,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) return -EFAULT; blocking = !(filp->f_flags & O_NONBLOCK); if (old_hdr.reply_len < 0) - return sg_new_write(sfp, buf, count, blocking, 0, NULL); + return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL); if (count < (SZ_SG_HEADER + 6)) return -EIO; /* The minimum scsi command length is 6 bytes. 
*/ @@ -621,8 +621,9 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) } static ssize_t -sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp) +sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, + size_t count, int blocking, int read_only, + Sg_request **o_srp) { int k; Sg_request *srp; @@ -678,8 +679,7 @@ sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, sg_remove_request(sfp, srp); return -EFAULT; } - if (read_only && - (!sg_allow_access(cmnd[0], sfp->parentdp->device->type))) { + if (read_only && (!blk_verify_command(file, cmnd))) { sg_remove_request(sfp, srp); return -EPERM; } @@ -799,7 +799,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR)) return -EFAULT; result = - sg_new_write(sfp, p, SZ_SG_IO_HDR, + sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, blocking, read_only, &srp); if (result < 0) return result; @@ -1048,7 +1048,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (copy_from_user(&opcode, siocp->data, 1)) return -EFAULT; - if (!sg_allow_access(opcode, sdp->device->type)) + if (!blk_verify_command(filp, &opcode)) return -EPERM; } return sg_scsi_ioctl(filp, sdp->device->request_queue, NULL, p); @@ -2506,26 +2506,6 @@ sg_page_free(struct page *page, int size) #define MAINTENANCE_IN_CMD 0xa3 #endif -static unsigned char allow_ops[] = { TEST_UNIT_READY, REQUEST_SENSE, - INQUIRY, READ_CAPACITY, READ_BUFFER, READ_6, READ_10, READ_12, - READ_16, MODE_SENSE, MODE_SENSE_10, LOG_SENSE, REPORT_LUNS, - SERVICE_ACTION_IN, RECEIVE_DIAGNOSTIC, READ_LONG, MAINTENANCE_IN_CMD -}; - -static int -sg_allow_access(unsigned char opcode, char dev_type) -{ - int k; - - if (TYPE_SCANNER == dev_type) /* TYPE_ROM maybe burner */ - return 1; - for (k = 0; k < sizeof (allow_ops); ++k) { - if (opcode == allow_ops[k]) - return 1; - } - return 0; -} - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, 
void *p, void *data) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3ae9ad9721..a842b776d09 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -671,7 +671,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern int blk_verify_command(unsigned char *, int); extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -797,6 +796,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, extern int blkdev_issue_flush(struct block_device *, sector_t *); +/* +* command filter functions +*/ +extern int blk_verify_command(struct file *file, unsigned char *cmd); +extern int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode); +extern int blk_register_filter(struct gendisk *disk); +extern void blk_unregister_filter(struct gendisk *disk); + #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 #define SAFE_MAX_SECTORS 255 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 524ec96f5a2..e8787417f65 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -110,6 +110,14 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_FAIL 64 +#define BLK_SCSI_MAX_CMDS (256) +#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) + +struct blk_scsi_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + struct kobject kobj; +}; struct gendisk { int major; /* major number of driver */ @@ -120,6 +128,7 @@ struct gendisk { struct hd_struct **part; /* [indexed by minor] */ struct block_device_operations *fops; struct request_queue *queue; + struct blk_scsi_cmd_filter cmd_filter; void *private_data; sector_t capacity; -- 
cgit v1.2.3-70-g09d2 From b24498d477a14680fc3bb3ad884fa9fa76a2d237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jun 2008 09:12:09 +0200 Subject: block: integrity flags can't use bit ops on unsigned short Just use normal open coded bit operations instead, they need not be atomic. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 17 +++++++---------- include/linux/blkdev.h | 8 ++++---- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 4ffa3814f6a..3f1a8478cc3 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -217,17 +217,16 @@ static ssize_t integrity_read_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ; else - clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_READ; return count; } static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); } static ssize_t integrity_write_store(struct blk_integrity *bi, @@ -237,17 +236,16 @@ static ssize_t integrity_write_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_WRITE; else - clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_WRITE; return count; } static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 
1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -340,8 +338,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); - set_bit(INTEGRITY_FLAG_READ, &bi->flags); - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; bi->sector_size = disk->queue->hardsect_size; disk->integrity = bi; } else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a842b776d09..7ab8acad5b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -870,8 +870,8 @@ void kblockd_flush_work(struct work_struct *work); #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ -#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ +#define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ struct blk_integrity_exchg { void *prot_buf; @@ -940,11 +940,11 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) return 0; if (rw == READ && bi->verify_fn != NULL && - test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_READ)) return 1; if (rw == WRITE && bi->generate_fn != NULL && - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_WRITE)) return 1; return 0; -- cgit v1.2.3-70-g09d2 From cc371e66e340f35eed8dc4651c7c18e754c7fb26 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 3 Jul 2008 09:53:43 +0200 Subject: Add bvec_merge_data to handle stacked devices and ->merge_bvec() When devices are stacked, one device's merge_bvec_fn may need to perform the mapping and then call one or more functions for its underlying devices. 
The following bio fields are used: bio->bi_sector bio->bi_bdev bio->bi_size bio->bi_rw using bio_data_dir() This patch creates a new struct bvec_merge_data holding a copy of those fields to avoid having to change them directly in the struct bio when going down the stack only to have to change them back again on the way back up. (And then when the bio gets mapped for real, the whole exercise gets repeated, but that's a problem for another day...) Signed-off-by: Alasdair G Kergon Cc: Neil Brown Cc: Milan Broz Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 9 +++++---- drivers/md/linear.c | 10 ++++++---- drivers/md/raid0.c | 10 ++++++---- drivers/md/raid10.c | 15 ++++++++------- drivers/md/raid5.c | 10 ++++++---- fs/bio.c | 26 +++++++++++++++++++++----- include/linux/blkdev.h | 9 ++++++++- 7 files changed, 60 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3ba1df93e9e..589850cff35 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2633,11 +2633,12 @@ end_io: -static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *bvec) +static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bio->bi_sector, pd); - int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + sector_t zone = ZONE(bmd->bi_sector, pd); + int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; @@ -2645,7 +2646,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_v * A bio <= PAGE_SIZE must be allowed. If it crosses a packet * boundary, pkt_make_request() will split the bio. 
*/ - remaining2 = PAGE_SIZE - bio->bi_size; + remaining2 = PAGE_SIZE - bmd->bi_size; remaining = max(remaining, remaining2); BUG_ON(remaining < 0); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2..6a866d7c8ae 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04ddec7..bcbb82594a1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. 
* * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b640a..22bb2b1b886 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. 
*/ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee28fcc..9b00675dc64 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3319,15 +3319,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. 
*/ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; diff --git a/fs/bio.c b/fs/bio.c index 7761c84c703..88322b066ac 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -369,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; diff --git 
a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ab8acad5b6..ff9d0bdf2a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -254,7 +254,14 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); struct bio_vec; -typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); +struct bvec_merge_data { + struct block_device *bi_bdev; + sector_t bi_sector; + unsigned bi_size; + unsigned long bi_rw; +}; +typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, + struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); -- cgit v1.2.3-70-g09d2 From e48ec69005f02b70b7ecfde1bc39a599086d16ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jul 2008 13:18:54 +0200 Subject: block: extend queue_flag bitops Add test_and_clear and test_and_set. 
Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++-------- include/linux/blkdev.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index e0fb0bcc0c1..dbc7f42b5d2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -205,8 +205,7 @@ void blk_plug_device(struct request_queue *q) if (blk_queue_stopped(q)) return; - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { - __set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); } @@ -221,10 +220,9 @@ int blk_remove_plug(struct request_queue *q) { WARN_ON(!irqs_disabled()); - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; - queue_flag_clear(QUEUE_FLAG_PLUGGED, q); del_timer(&q->unplug_timer); return 1; } @@ -328,8 +326,7 @@ void blk_start_queue(struct request_queue *q) * one level of recursion is ok and is much faster than kicking * the unplug handling */ - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { @@ -394,8 +391,7 @@ void __blk_run_queue(struct request_queue *q) * handling reinvoke the handler shortly if we already got there. 
*/ if (!elv_queue_empty(q)) { - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff9d0bdf2a1..e04c4ac8a7c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -428,6 +428,32 @@ static inline void queue_flag_set_unlocked(unsigned int flag, __set_bit(flag, &q->queue_flags); } +static inline int queue_flag_test_and_clear(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (test_bit(flag, &q->queue_flags)) { + __clear_bit(flag, &q->queue_flags); + return 1; + } + + return 0; +} + +static inline int queue_flag_test_and_set(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (!test_bit(flag, &q->queue_flags)) { + __set_bit(flag, &q->queue_flags); + return 0; + } + + return 1; +} + static inline void queue_flag_set(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); -- cgit v1.2.3-70-g09d2 From 42796d37da6ef4fd851dc6d5d0387baf7e2b0c3c Mon Sep 17 00:00:00 2001 From: eric miao Date: Mon, 14 Apr 2008 09:35:08 +0100 Subject: [ARM] pxa: add generic PWM backlight driver Patch mostly from Eric Miao, with minor edits by rmk to convert Eric's driver to a generic PWM-based backlight driver. 
Signed-off-by: eric miao Signed-off-by: Russell King --- drivers/video/backlight/Kconfig | 7 ++ drivers/video/backlight/Makefile | 1 + drivers/video/backlight/pwm_bl.c | 160 +++++++++++++++++++++++++++++++++++++++ include/linux/pwm_backlight.h | 14 ++++ 4 files changed, 182 insertions(+) create mode 100644 drivers/video/backlight/pwm_bl.c create mode 100644 include/linux/pwm_backlight.h (limited to 'include/linux') diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index dcd8073c236..30bf7f2f163 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -112,3 +112,10 @@ config BACKLIGHT_CARILLO_RANCH help If you have a Intel LE80578 (Carillo Ranch) say Y to enable the backlight driver. + +config BACKLIGHT_PWM + tristate "Generic PWM based Backlight Driver" + depends on BACKLIGHT_CLASS_DEVICE && HAVE_PWM + help + If you have a LCD backlight adjustable by PWM, say Y to enable + this driver. diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 33f6c7cecc7..b51a7cd1250 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o obj-$(CONFIG_BACKLIGHT_OMAP1) += omap1_bl.o obj-$(CONFIG_BACKLIGHT_PROGEAR) += progear_bl.o obj-$(CONFIG_BACKLIGHT_CARILLO_RANCH) += cr_bllcd.o +obj-$(CONFIG_BACKLIGHT_PWM) += pwm_bl.o diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c new file mode 100644 index 00000000000..9637f5e08cb --- /dev/null +++ b/drivers/video/backlight/pwm_bl.c @@ -0,0 +1,160 @@ +/* + * linux/drivers/video/backlight/pwm_bl.c + * + * simple PWM based backlight control, board code has to setup + * 1) pin configuration so PWM waveforms can output + * 2) platform_data casts to the PWM id (0/1/2/3 on PXA) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by 
the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct pwm_bl_data { + struct pwm_device *pwm; + unsigned int period; +}; + +static int pwm_backlight_update_status(struct backlight_device *bl) +{ + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + int brightness = bl->props.brightness; + int max = bl->props.max_brightness; + + if (bl->props.power != FB_BLANK_UNBLANK) + brightness = 0; + + if (bl->props.fb_blank != FB_BLANK_UNBLANK) + brightness = 0; + + if (brightness == 0) { + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + } else { + pwm_config(pb->pwm, brightness * pb->period / max, pb->period); + pwm_enable(pb->pwm); + } + return 0; +} + +static int pwm_backlight_get_brightness(struct backlight_device *bl) +{ + return bl->props.brightness; +} + +static struct backlight_ops pwm_backlight_ops = { + .update_status = pwm_backlight_update_status, + .get_brightness = pwm_backlight_get_brightness, +}; + +static int pwm_backlight_probe(struct platform_device *pdev) +{ + struct platform_pwm_backlight_data *data = pdev->dev.platform_data; + struct backlight_device *bl; + struct pwm_bl_data *pb; + + if (!data) + return -EINVAL; + + pb = kzalloc(sizeof(*pb), GFP_KERNEL); + if (!pb) + return -ENOMEM; + + pb->period = data->pwm_period_ns; + + pb->pwm = pwm_request(data->pwm_id, "backlight"); + if (pb->pwm == NULL) { + dev_err(&pdev->dev, "unable to request PWM for backlight\n"); + kfree(pb); + return -EBUSY; + } + + bl = backlight_device_register(pdev->name, &pdev->dev, + pb, &pwm_backlight_ops); + if (IS_ERR(bl)) { + dev_err(&pdev->dev, "failed to register backlight\n"); + pwm_free(pb->pwm); + kfree(pb); + return PTR_ERR(bl); + } + + bl->props.max_brightness = data->max_brightness; + bl->props.brightness = data->dft_brightness; + backlight_update_status(bl); + + platform_set_drvdata(pdev, bl); + return 0; +} + +static int pwm_backlight_remove(struct platform_device *pdev) +{ 
+ struct backlight_device *bl = platform_get_drvdata(pdev); + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + + backlight_device_unregister(bl); + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + pwm_free(pb->pwm); + kfree(pb); + return 0; +} + +#ifdef CONFIG_PM +static int pwm_backlight_suspend(struct platform_device *pdev, + pm_message_t state) +{ + struct backlight_device *bl = platform_get_drvdata(pdev); + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + return 0; +} + +static int pwm_backlight_resume(struct platform_device *pdev) +{ + struct backlight_device *bl = platform_get_drvdata(pdev); + + backlight_update_status(bl); + return 0; +} +#else +#define pwm_backlight_suspend NULL +#define pwm_backlight_resume NULL +#endif + +static struct platform_driver pwm_backlight_driver = { + .driver = { + .name = "pwm-backlight", + .owner = THIS_MODULE, + }, + .probe = pwm_backlight_probe, + .remove = pwm_backlight_remove, + .suspend = pwm_backlight_suspend, + .resume = pwm_backlight_resume, +}; + +static int __init pwm_backlight_init(void) +{ + return platform_driver_register(&pwm_backlight_driver); +} +module_init(pwm_backlight_init); + +static void __exit pwm_backlight_exit(void) +{ + platform_driver_unregister(&pwm_backlight_driver); +} +module_exit(pwm_backlight_exit); + +MODULE_DESCRIPTION("PWM based Backlight Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h new file mode 100644 index 00000000000..aeeffedbe82 --- /dev/null +++ b/include/linux/pwm_backlight.h @@ -0,0 +1,14 @@ +/* + * Generic PWM backlight driver data - see drivers/video/backlight/pwm_bl.c + */ +#ifndef __LINUX_PWM_BACKLIGHT_H +#define __LINUX_PWM_BACKLIGHT_H + +struct platform_pwm_backlight_data { + int pwm_id; + unsigned int max_brightness; + unsigned int dft_brightness; + unsigned int pwm_period_ns; +}; + +#endif -- cgit v1.2.3-70-g09d2 From 
3b73125af69f93972625f4b655675f42ca4274eb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 22 May 2008 14:18:40 +0100 Subject: [ARM] 5044/1: pwm_bl: add init/notify/exit callbacks This allows platform code to manipulate GPIOs and brightness level as needed. Signed-off-by: Philipp Zabel Signed-off-by: Russell King --- drivers/video/backlight/pwm_bl.c | 39 ++++++++++++++++++++++++++++++++------- include/linux/pwm_backlight.h | 3 +++ 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 9637f5e08cb..8346dfc01cf 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -23,6 +23,7 @@ struct pwm_bl_data { struct pwm_device *pwm; unsigned int period; + int (*notify)(int brightness); }; static int pwm_backlight_update_status(struct backlight_device *bl) @@ -37,6 +38,9 @@ static int pwm_backlight_update_status(struct backlight_device *bl) if (bl->props.fb_blank != FB_BLANK_UNBLANK) brightness = 0; + if (pb->notify) + brightness = pb->notify(brightness); + if (brightness == 0) { pwm_config(pb->pwm, 0, pb->period); pwm_disable(pb->pwm); @@ -62,30 +66,39 @@ static int pwm_backlight_probe(struct platform_device *pdev) struct platform_pwm_backlight_data *data = pdev->dev.platform_data; struct backlight_device *bl; struct pwm_bl_data *pb; + int ret; if (!data) return -EINVAL; + if (data->init) { + ret = data->init(&pdev->dev); + if (ret < 0) + return ret; + } + pb = kzalloc(sizeof(*pb), GFP_KERNEL); - if (!pb) - return -ENOMEM; + if (!pb) { + ret = -ENOMEM; + goto err_alloc; + } pb->period = data->pwm_period_ns; + pb->notify = data->notify; pb->pwm = pwm_request(data->pwm_id, "backlight"); if (pb->pwm == NULL) { dev_err(&pdev->dev, "unable to request PWM for backlight\n"); - kfree(pb); - return -EBUSY; + ret = -EBUSY; + goto err_pwm; } bl = backlight_device_register(pdev->name, &pdev->dev, pb, &pwm_backlight_ops); if (IS_ERR(bl)) { 
dev_err(&pdev->dev, "failed to register backlight\n"); - pwm_free(pb->pwm); - kfree(pb); - return PTR_ERR(bl); + ret = PTR_ERR(bl); + goto err_bl; } bl->props.max_brightness = data->max_brightness; @@ -94,10 +107,20 @@ static int pwm_backlight_probe(struct platform_device *pdev) platform_set_drvdata(pdev, bl); return 0; + +err_bl: + pwm_free(pb->pwm); +err_pwm: + kfree(pb); +err_alloc: + if (data->exit) + data->exit(&pdev->dev); + return ret; } static int pwm_backlight_remove(struct platform_device *pdev) { + struct platform_pwm_backlight_data *data = pdev->dev.platform_data; struct backlight_device *bl = platform_get_drvdata(pdev); struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); @@ -106,6 +129,8 @@ static int pwm_backlight_remove(struct platform_device *pdev) pwm_disable(pb->pwm); pwm_free(pb->pwm); kfree(pb); + if (data->exit) + data->exit(&pdev->dev); return 0; } diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h index aeeffedbe82..7a9754c9677 100644 --- a/include/linux/pwm_backlight.h +++ b/include/linux/pwm_backlight.h @@ -9,6 +9,9 @@ struct platform_pwm_backlight_data { unsigned int max_brightness; unsigned int dft_brightness; unsigned int pwm_period_ns; + int (*init)(struct device *dev); + int (*notify)(int brightness); + void (*exit)(struct device *dev); }; #endif -- cgit v1.2.3-70-g09d2 From 41d54d3bf83f62d3ff5948cb788fe6007e66a0d0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 3 Jul 2008 09:14:26 -0500 Subject: slub: Do not use 192 byte sized cache if minimum alignment is 128 byte The 192 byte cache is not necessary if we have a basic alignment of 128 byte. If it would be used then the 192 would be aligned to the next 128 byte boundary which would result in another 256 byte cache. Two 256 kmalloc caches cause sysfs to complain about a duplicate entry. MIPS needs 128 byte aligned kmalloc caches and spits out warnings on boot without this patch. 
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 2 ++ mm/slub.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 71e43a12ebb..cef6f8fddd7 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -137,10 +137,12 @@ static __always_inline int kmalloc_index(size_t size) if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; +#if KMALLOC_MIN_SIZE <= 64 if (size > 64 && size <= 96) return 1; if (size > 128 && size <= 192) return 2; +#endif if (size <= 8) return 3; if (size <= 16) return 4; if (size <= 32) return 5; diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943..2c9a62d1f42 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2995,8 +2995,6 @@ void __init kmem_cache_init(void) create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_KERNEL); caches++; - } - if (KMALLOC_MIN_SIZE <= 128) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_KERNEL); caches++; @@ -3026,6 +3024,16 @@ void __init kmem_cache_init(void) for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + if (KMALLOC_MIN_SIZE == 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead. + */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[(i - 1) / 8] = 8; + } + slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ -- cgit v1.2.3-70-g09d2 From 27f8221af406e43b529a5425bc99c9b1e9bdf521 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:30:03 +0200 Subject: block: add blk_queue_update_dma_pad This adds blk_queue_update_dma_pad to prevent LLDs from overwriting the dma pad mask wrongly (we added blk_queue_update_dma_alignment due to the same reason). This also converts libata to use blk_queue_update_dma_pad instead of blk_queue_dma_pad. 
Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-settings.c | 24 ++++++++++++++++++++---- drivers/ata/libata-scsi.c | 3 ++- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 8dd86418f35..dfc77012843 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -302,11 +302,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * @q: the request queue for the device * @mask: pad mask * - * Set pad mask. Direct IO requests are padded to the mask specified. + * Set dma pad mask. * - * Appending pad buffer to a request modifies ->data_len such that it - * includes the pad buffer. The original requested data length can be - * obtained using blk_rq_raw_data_len(). + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. **/ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) { @@ -314,6 +313,23 @@ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) } EXPORT_SYMBOL(blk_queue_dma_pad); +/** + * blk_queue_update_dma_pad - update pad mask + * @q: the request queue for the device + * @mask: pad mask + * + * Update dma pad mask. + * + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. + **/ +void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) +{ + if (mask > q->dma_pad_mask) + q->dma_pad_mask = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_pad); + /** * blk_queue_dma_drain - Set up a drain buffer for excess dma. 
* @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a43649a46..499ccc628d8 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -885,7 +885,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, /* set the min alignment and padding */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); - blk_queue_dma_pad(sdev->request_queue, ATA_DMA_PAD_SZ - 1); + blk_queue_update_dma_pad(sdev->request_queue, + ATA_DMA_PAD_SZ - 1); /* configure draining */ buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e04c4ac8a7c..1ffd8bfdc4c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -776,6 +776,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); +extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); -- cgit v1.2.3-70-g09d2 From cde53535991fbb5c34a1566f25955297c1487b8d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 4 Jul 2008 09:59:22 -0700 Subject: Christoph has moved Remove all clameter@sgi.com addresses from the kernel tree since they will become invalid on June 27th. Change my maintainer email address for the slab allocators to cl@linux-foundation.org (which will be the new email address for the future). 
Signed-off-by: Christoph Lameter Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Stephen Rothwell Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/slabinfo.c | 4 ++-- Documentation/vm/slub.txt | 2 +- MAINTAINERS | 2 +- include/asm-generic/atomic.h | 2 +- include/linux/slab.h | 2 +- include/linux/slub_def.h | 2 +- kernel/workqueue.c | 2 +- lib/radix-tree.c | 2 +- mm/allocpercpu.c | 2 +- mm/migrate.c | 2 +- mm/slub.c | 2 +- mm/sparse-vmemmap.c | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index e4230ed16ee..df3227605d5 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c @@ -1,7 +1,7 @@ /* * Slabinfo: Tool to get reports about slabs * - * (C) 2007 sgi, Christoph Lameter + * (C) 2007 sgi, Christoph Lameter * * Compile by: * @@ -99,7 +99,7 @@ void fatal(const char *x, ...) void usage(void) { - printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n" + printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" "-a|--aliases Show aliases\n" "-A|--activity Most active slabs first\n" diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt index 7c13f22a0c9..bb1f5c6e28b 100644 --- a/Documentation/vm/slub.txt +++ b/Documentation/vm/slub.txt @@ -266,4 +266,4 @@ of other objects. 
slub_debug=FZ,dentry -Christoph Lameter, , May 30, 2007 +Christoph Lameter, May 30, 2007 diff --git a/MAINTAINERS b/MAINTAINERS index 460e699fd28..13b7b19692e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3672,7 +3672,7 @@ S: Maintained SLAB ALLOCATOR P: Christoph Lameter -M: clameter@sgi.com +M: cl@linux-foundation.org P: Pekka Enberg M: penberg@cs.helsinki.fi P: Matt Mackall diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 85fd0aa27a8..4ec0a296bde 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -2,7 +2,7 @@ #define _ASM_GENERIC_ATOMIC_H /* * Copyright (C) 2005 Silicon Graphics, Inc. - * Christoph Lameter + * Christoph Lameter * * Allows to provide arch independent atomic definitions without the need to * edit all arch specific atomic.h files. diff --git a/include/linux/slab.h b/include/linux/slab.h index c2ad3501659..9aa90a6f20e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1,7 +1,7 @@ /* * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). * - * (C) SGI 2006, Christoph Lameter + * (C) SGI 2006, Christoph Lameter * Cleaned up and restructured to ease the addition of alternative * implementations of SLAB allocators. */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cef6f8fddd7..d117ea2825a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -4,7 +4,7 @@ /* * SLUB : A Slab allocator without object queues. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include #include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29fc39f1029..ce7799540c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -13,7 +13,7 @@ * Kai Petzke * Theodore Ts'o * - * Made to use alloc_percpu by Christoph Lameter . + * Made to use alloc_percpu by Christoph Lameter. 
*/ #include diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 169a2f8dabc..56ec21a7f73 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig - * Copyright (C) 2005 SGI, Christoph Lameter + * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index f4026bae6ee..05f2b4009cc 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -1,7 +1,7 @@ /* * linux/mm/allocpercpu.c * - * Separated from slab.c August 11, 2006 Christoph Lameter + * Separated from slab.c August 11, 2006 Christoph Lameter */ #include #include diff --git a/mm/migrate.c b/mm/migrate.c index 112bcaeaa10..55bd355d170 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -9,7 +9,7 @@ * IWAMOTO Toshihiro * Hirokazu Takahashi * Dave Hansen - * Christoph Lameter + * Christoph Lameter */ #include diff --git a/mm/slub.c b/mm/slub.c index 2c9a62d1f42..1a427c0ae83 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5,7 +5,7 @@ * The allocator synchronizes using per slab locks and only * uses a centralized lock to manage a pool of partial slabs. * - * (C) 2007 SGI, Christoph Lameter + * (C) 2007 SGI, Christoph Lameter */ #include diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 99c4f36eb8a..a91b5f8fcaf 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -1,7 +1,7 @@ /* * Virtual Memory Map support * - * (C) 2007 sgi. Christoph Lameter . + * (C) 2007 sgi. Christoph Lameter. 
* * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, * virt_to_page, page_address() to be implemented as a base offset -- cgit v1.2.3-70-g09d2 From 69d44a1835ec8163a82c4ee57367f87ae0f85c2e Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Fri, 4 Jul 2008 09:59:27 -0700 Subject: firmware: fix the request_firmware() dummy > the build (.config attached) failed, make ends with : > ... > UPD include/linux/compile.h > CC init/version.o > LD init/built-in.o > LD vmlinux > drivers/built-in.o: In function `sas_request_addr': > (.text+0x33bab): undefined reference to `request_firmware' > drivers/built-in.o: In function `sas_request_addr': > (.text+0x33c3f): undefined reference to `release_firmware' > make: *** [vmlinux] Error 1 There's a slight fault in the stub logic. It fails for FW_LOADER=m and the user =y. This should fix it. This patch fixes the following 2.6.26-rc regression: http://bugzilla.kernel.org/show_bug.cgi?id=10730 Reviewed-by: Toralf Foerster Signed-off-by: Adrian Bunk Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/firmware.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 4d10c7328d2..6c7eff2ebad 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -13,7 +13,7 @@ struct firmware { struct device; -#if defined(CONFIG_FW_LOADER) || defined(CONFIG_FW_LOADER_MODULE) +#if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, struct device *device); int request_firmware_nowait( -- cgit v1.2.3-70-g09d2 From 450c622e9ff19888818d4e2c4d31adb97a5242b2 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 4 Jul 2008 09:59:33 -0700 Subject: Miguel Ojeda has moved Signed-off-by: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 5 +++-- Documentation/auxdisplay/cfag12864b | 4 ++-- Documentation/auxdisplay/cfag12864b-example.c | 2 +- Documentation/auxdisplay/ks0108 | 4 ++-- MAINTAINERS | 20 ++++++++++++-------- drivers/auxdisplay/Kconfig | 2 +- drivers/auxdisplay/cfag12864b.c | 4 ++-- drivers/auxdisplay/cfag12864bfb.c | 4 ++-- drivers/auxdisplay/ks0108.c | 4 ++-- include/linux/cfag12864b.h | 2 +- include/linux/ks0108.h | 2 +- 11 files changed, 29 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/CREDITS b/CREDITS index 8fec7b3f96d..e97bea06b59 100644 --- a/CREDITS +++ b/CREDITS @@ -2611,8 +2611,9 @@ S: Perth, Western Australia S: Australia N: Miguel Ojeda Sandonis -E: maxextreme@gmail.com -W: http://maxextreme.googlepages.com/ +E: miguel.ojeda.sandonis@gmail.com +W: http://miguelojeda.es +W: http://jair.lab.fi.uva.es/~migojed/ D: Author of the ks0108, cfag12864b and cfag12864bfb auxiliary display drivers. 
D: Maintainer of the auxiliary display drivers tree (drivers/auxdisplay/*) S: C/ Mieses 20, 9-B diff --git a/Documentation/auxdisplay/cfag12864b b/Documentation/auxdisplay/cfag12864b index b714183d412..eb7be393a51 100644 --- a/Documentation/auxdisplay/cfag12864b +++ b/Documentation/auxdisplay/cfag12864b @@ -3,7 +3,7 @@ =================================== License: GPLv2 -Author & Maintainer: Miguel Ojeda Sandonis +Author & Maintainer: Miguel Ojeda Sandonis Date: 2006-10-27 @@ -22,7 +22,7 @@ Date: 2006-10-27 1. DRIVER INFORMATION --------------------- -This driver support one cfag12864b display at time. +This driver supports a cfag12864b LCD. --------------------- diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c index 7bfac354d4c..2caeea5e499 100644 --- a/Documentation/auxdisplay/cfag12864b-example.c +++ b/Documentation/auxdisplay/cfag12864b-example.c @@ -4,7 +4,7 @@ * Description: cfag12864b LCD userspace example program * License: GPLv2 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 * * This program is free software; you can redistribute it and/or modify diff --git a/Documentation/auxdisplay/ks0108 b/Documentation/auxdisplay/ks0108 index 92b03b60c61..8ddda0c8cee 100644 --- a/Documentation/auxdisplay/ks0108 +++ b/Documentation/auxdisplay/ks0108 @@ -3,7 +3,7 @@ ========================================== License: GPLv2 -Author & Maintainer: Miguel Ojeda Sandonis +Author & Maintainer: Miguel Ojeda Sandonis Date: 2006-10-27 @@ -21,7 +21,7 @@ Date: 2006-10-27 1. DRIVER INFORMATION --------------------- -This driver support the ks0108 LCD controller. +This driver supports the ks0108 LCD controller. 
--------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 13b7b19692e..ba7ac13aba9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -763,9 +763,10 @@ S: Maintained AUXILIARY DISPLAY DRIVERS P: Miguel Ojeda Sandonis -M: maxextreme@gmail.com +M: miguel.ojeda.sandonis@gmail.com L: linux-kernel@vger.kernel.org -W: http://auxdisplay.googlepages.com/ +W: http://miguelojeda.es/auxdisplay.htm +W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained AVR32 ARCHITECTURE @@ -1055,16 +1056,18 @@ S: Supported CFAG12864B LCD DRIVER P: Miguel Ojeda Sandonis -M: maxextreme@gmail.com +M: miguel.ojeda.sandonis@gmail.com L: linux-kernel@vger.kernel.org -W: http://auxdisplay.googlepages.com/ +W: http://miguelojeda.es/auxdisplay.htm +W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained CFAG12864BFB LCD FRAMEBUFFER DRIVER P: Miguel Ojeda Sandonis -M: maxextreme@gmail.com +M: miguel.ojeda.sandonis@gmail.com L: linux-kernel@vger.kernel.org -W: http://auxdisplay.googlepages.com/ +W: http://miguelojeda.es/auxdisplay.htm +W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained CFG80211 and NL80211 @@ -2428,9 +2431,10 @@ S: Maintained KS0108 LCD CONTROLLER DRIVER P: Miguel Ojeda Sandonis -M: maxextreme@gmail.com +M: miguel.ojeda.sandonis@gmail.com L: linux-kernel@vger.kernel.org -W: http://auxdisplay.googlepages.com/ +W: http://miguelojeda.es/auxdisplay.htm +W: http://jair.lab.fi.uva.es/~migojed/auxdisplay.htm S: Maintained LAPB module diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 043353bd060..14b9d5f4c20 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -64,7 +64,7 @@ config KS0108_DELAY Amount of time the ks0108 should wait between each control write to the parallel port. - If your driver seems to miss random writings, increment this. + If your LCD seems to miss random writings, increment this. If you don't know what I'm talking about, ignore it. 
diff --git a/drivers/auxdisplay/cfag12864b.c b/drivers/auxdisplay/cfag12864b.c index 80bb0610538..683509f013a 100644 --- a/drivers/auxdisplay/cfag12864b.c +++ b/drivers/auxdisplay/cfag12864b.c @@ -5,7 +5,7 @@ * License: GPLv2 * Depends: ks0108 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 * * This program is free software; you can redistribute it and/or modify @@ -398,5 +398,5 @@ module_init(cfag12864b_init); module_exit(cfag12864b_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda Sandonis "); MODULE_DESCRIPTION("cfag12864b LCD driver"); diff --git a/drivers/auxdisplay/cfag12864bfb.c b/drivers/auxdisplay/cfag12864bfb.c index 307c190699e..fe3a865be4e 100644 --- a/drivers/auxdisplay/cfag12864bfb.c +++ b/drivers/auxdisplay/cfag12864bfb.c @@ -5,7 +5,7 @@ * License: GPLv2 * Depends: cfag12864b * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 * * This program is free software; you can redistribute it and/or modify @@ -186,5 +186,5 @@ module_init(cfag12864bfb_init); module_exit(cfag12864bfb_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda Sandonis "); MODULE_DESCRIPTION("cfag12864b LCD framebuffer driver"); diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index e6c3646ef18..5b93852392b 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -5,7 +5,7 @@ * License: GPLv2 * Depends: parport * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 * * This program is free software; you can redistribute it and/or modify @@ -173,6 +173,6 @@ module_init(ks0108_init); module_exit(ks0108_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Miguel Ojeda Sandonis "); +MODULE_AUTHOR("Miguel Ojeda Sandonis "); MODULE_DESCRIPTION("ks0108 LCD 
Controller driver"); diff --git a/include/linux/cfag12864b.h b/include/linux/cfag12864b.h index 1605dd8aa64..6f9f19d6659 100644 --- a/include/linux/cfag12864b.h +++ b/include/linux/cfag12864b.h @@ -4,7 +4,7 @@ * Description: cfag12864b LCD driver header * License: GPLv2 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-12 * * This program is free software; you can redistribute it and/or modify diff --git a/include/linux/ks0108.h b/include/linux/ks0108.h index a2c54acceb4..cb311798e0b 100644 --- a/include/linux/ks0108.h +++ b/include/linux/ks0108.h @@ -4,7 +4,7 @@ * Description: ks0108 LCD Controller driver header * License: GPLv2 * - * Author: Copyright (C) Miguel Ojeda Sandonis + * Author: Copyright (C) Miguel Ojeda Sandonis * Date: 2006-10-31 * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3-70-g09d2 From 93921f5c2ce7427cc30341c86882527d1d1d8770 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 4 Jul 2008 09:59:48 -0700 Subject: Introduce rculist.h In linux-next there is a commit ("rcu: split list.h and move rcu-protected lists into rculist.h") that moved the rcu related list iterators from list.h to rculist.h. Add a trivial version of the file now so that various subsystem trees can start using it now for -next changes and so reduce the build errors caused by adding uses of the moved functions. Cc: Franck Bui-Huu Acked-by: Paul E. 
McKenney Cc: Josh Triplett Acked-by: Ingo Molnar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rculist.h | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 include/linux/rculist.h (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h new file mode 100644 index 00000000000..bde4586f438 --- /dev/null +++ b/include/linux/rculist.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_RCULIST_H +#define _LINUX_RCULIST_H + +#include <linux/list.h> + +#endif /* _LINUX_RCULIST_H */ -- cgit v1.2.3-70-g09d2 From 086f7316f0d400806d76323beefae996bb3849b1 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Fri, 4 Jul 2008 09:59:58 -0700 Subject: security: filesystem capabilities: fix fragile setuid fixup code This commit includes a bugfix for the fragile setuid fixup code in the case that filesystem capabilities are supported (in access()). The effect of this fix is gated on filesystem capability support because changing securebits is only supported when filesystem capabilities support is configured. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrew G.
Morgan Acked-by: Serge Hallyn Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 37 ++++++++++++++++++++++--------------- include/linux/capability.h | 2 ++ include/linux/securebits.h | 15 ++++++++------- kernel/capability.c | 21 +++++++++++++++++++++ 4 files changed, 53 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/open.c b/fs/open.c index a1450086e92..a99ad09c319 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) { struct nameidata nd; int old_fsuid, old_fsgid; - kernel_cap_t old_cap; + kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ int res; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ @@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) old_fsuid = current->fsuid; old_fsgid = current->fsgid; - old_cap = current->cap_effective; current->fsuid = current->uid; current->fsgid = current->gid; - /* - * Clear the capabilities if we switch to a non-root user - * - * FIXME: There is a race here against sys_capset. The - * capabilities can change yet we will restore the old - * value below. We should hold task_capabilities_lock, - * but we cannot because user_path_walk can sleep. - */ - if (current->uid) - cap_clear(current->cap_effective); - else - current->cap_effective = current->cap_permitted; + if (!issecure(SECURE_NO_SETUID_FIXUP)) { + /* + * Clear the capabilities if we switch to a non-root user + */ +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + /* + * FIXME: There is a race here against sys_capset. The + * capabilities can change yet we will restore the old + * value below. We should hold task_capabilities_lock, + * but we cannot because user_path_walk can sleep. 
+ */ +#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ + if (current->uid) + old_cap = cap_set_effective(__cap_empty_set); + else + old_cap = cap_set_effective(current->cap_permitted); + } res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (res) @@ -478,7 +483,9 @@ out_path_release: out: current->fsuid = old_fsuid; current->fsgid = old_fsgid; - current->cap_effective = old_cap; + + if (!issecure(SECURE_NO_SETUID_FIXUP)) + cap_set_effective(old_cap); return res; } diff --git a/include/linux/capability.h b/include/linux/capability.h index fa830f8de03..02673846d20 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -501,6 +501,8 @@ extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_full_set; extern const kernel_cap_t __cap_init_eff_set; +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); + int capable(int cap); int __capable(struct task_struct *t, int cap); diff --git a/include/linux/securebits.h b/include/linux/securebits.h index c1f19dbceb0..92f09bdf117 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -7,14 +7,15 @@ inheritance of root-permissions and suid-root executable under compatibility mode. We raise the effective and inheritable bitmasks *of the executable file* if the effective uid of the new process is - 0. If the real uid is 0, we raise the inheritable bitmask of the + 0. If the real uid is 0, we raise the effective (legacy) bit of the executable file. */ #define SECURE_NOROOT 0 #define SECURE_NOROOT_LOCKED 1 /* make bit-0 immutable */ -/* When set, setuid to/from uid 0 does not trigger capability-"fixes" - to be compatible with old programs relying on set*uid to loose - privileges. When unset, setuid doesn't change privileges. */ +/* When set, setuid to/from uid 0 does not trigger capability-"fixup". 
+ When unset, to provide compatibility with old programs relying on + set*uid to gain/lose privilege, transitions to/from uid 0 cause + capabilities to be gained/lost. */ #define SECURE_NO_SETUID_FIXUP 2 #define SECURE_NO_SETUID_FIXUP_LOCKED 3 /* make bit-2 immutable */ @@ -26,10 +27,10 @@ #define SECURE_KEEP_CAPS 4 #define SECURE_KEEP_CAPS_LOCKED 5 /* make bit-4 immutable */ -/* Each securesetting is implemented using two bits. One bit specify +/* Each securesetting is implemented using two bits. One bit specifies whether the setting is on or off. The other bit specify whether the - setting is fixed or not. A setting which is fixed cannot be changed - from user-level. */ + setting is locked or not. A setting which is locked cannot be + changed from user-level. */ #define issecure_mask(X) (1 << (X)) #define issecure(X) (issecure_mask(X) & current->securebits) diff --git a/kernel/capability.c b/kernel/capability.c index cfbe4429948..901e0fdc3ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -121,6 +121,27 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) * uninteresting and/or not to be changed. */ +/* + * Atomically modify the effective capabilities returning the original + * value. No permission check is performed here - it is assumed that the + * caller is permitted to set the desired effective capabilities. + */ +kernel_cap_t cap_set_effective(const kernel_cap_t pE_new) +{ + kernel_cap_t pE_old; + + spin_lock(&task_capability_lock); + + pE_old = current->cap_effective; + current->cap_effective = pE_new; + + spin_unlock(&task_capability_lock); + + return pE_old; +} + +EXPORT_SYMBOL(cap_set_effective); + /** * sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and -- cgit v1.2.3-70-g09d2 From e08c1694d9e2138204f2b79b73f0f159074ce2f5 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Fri, 4 Jul 2008 10:00:03 -0700 Subject: olpc: sdhci: add quirk for the Marvell CaFe's vdd/powerup issue This has been sitting around unloved for way too long.. The Marvell CaFe chip's SD implementation chokes during card insertion if one attempts to set the voltage and power up in the same SDHCI_POWER_CONTROL register write. This adds a quirk that does that particular dance in two steps. It also adds an entry to pci_ids.h for the CaFe chip's SD device. Signed-off-by: Andres Salomon Cc: Pierre Ossman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mmc/host/sdhci.c | 18 ++++++++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 07c2048b230..5b74c8cf440 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -55,6 +55,8 @@ static unsigned int debug_quirks = 0; #define SDHCI_QUIRK_32BIT_DMA_SIZE (1<<7) /* Controller needs to be reset after each request to stay stable */ #define SDHCI_QUIRK_RESET_AFTER_REQUEST (1<<8) +/* Controller needs voltage and power writes to happen separately */ +#define SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER (1<<9) static const struct pci_device_id pci_ids[] __devinitdata = { { @@ -127,6 +129,14 @@ static const struct pci_device_id pci_ids[] __devinitdata = { SDHCI_QUIRK_RESET_CMD_DATA_ON_IOS, }, + { + .vendor = PCI_VENDOR_ID_MARVELL, + .device = PCI_DEVICE_ID_MARVELL_CAFE_SD, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER, + }, + { .vendor = PCI_VENDOR_ID_JMICRON, .device = PCI_DEVICE_ID_JMICRON_JMB38X_SD, @@ -774,6 +784,14 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned short power) BUG(); } + /* + * At least the CaFe chip 
gets confused if we set the voltage + * and set turn on power at the same time, so set the voltage first. + */ + if ((host->chip->quirks & SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER)) + writeb(pwr & ~SDHCI_POWER_ON, + host->ioaddr + SDHCI_POWER_CONTROL); + writeb(pwr, host->ioaddr + SDHCI_POWER_CONTROL); out: diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index eafc9d6d2b3..65953822c9c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1520,6 +1520,7 @@ #define PCI_DEVICE_ID_MARVELL_GT64260 0x6430 #define PCI_DEVICE_ID_MARVELL_MV64360 0x6460 #define PCI_DEVICE_ID_MARVELL_MV64460 0x6480 +#define PCI_DEVICE_ID_MARVELL_CAFE_SD 0x4101 #define PCI_VENDOR_ID_V3 0x11b0 #define PCI_DEVICE_ID_V3_V960 0x0001 -- cgit v1.2.3-70-g09d2 From acb7669c125676e63cf96582455509216c39745e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 4 Jul 2008 10:00:05 -0700 Subject: cpumask: introduce new APIs In linux-next there is a commit ("x86: Add performance variants of cpumask operators") which, as part of the 4096 cpu support work adds some new APIs for dealing with cpu masks. Add trivial versions of these now so that subsystems can update in a timely manner and avoid conflicts in linux-next and the next merge window. 
Cc: Mike Travis Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 5df3db58fcc..c24875bd9c5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -353,6 +353,10 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp, for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #endif /* NR_CPUS */ +#define next_cpu_nr(n, src) next_cpu(n, src) +#define cpus_weight_nr(cpumask) cpus_weight(cpumask) +#define for_each_cpu_mask_nr(cpu, mask) for_each_cpu_mask(cpu, mask) + /* * The following particular system cpumasks and operations manage * possible, present and online cpus. Each of them is a fixed size -- cgit v1.2.3-70-g09d2 From ca31e146d5c2fe51498e619eb3a64782d02e310a Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sat, 5 Jul 2008 12:14:23 +0300 Subject: Move _RET_IP_ and _THIS_IP_ to include/linux/kernel.h These two macros are useful beyond lock debugging. Moved definitions from include/linux/debug_locks.h to include/linux/kernel.h, so code that needs them does not have to include the former, which would have been a less intuitive choice of a header. 
Signed-off-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- include/linux/debug_locks.h | 10 ++-------- include/linux/kernel.h | 3 +++ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index f4a5871767f..4aaa4afb1cb 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -1,6 +1,8 @@ #ifndef __LINUX_DEBUG_LOCKING_H #define __LINUX_DEBUG_LOCKING_H +#include <linux/kernel.h> + struct task_struct; extern int debug_locks; @@ -11,14 +13,6 @@ extern int debug_locks_silent; */ extern int debug_locks_off(void); -/* - * In the debug case we carry the caller's instruction pointer into - * other functions, but we dont want the function argument overhead - * in the nondebug case - hence these macros: - */ -#define _RET_IP_ (unsigned long)__builtin_return_address(0) -#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) - #define DEBUG_LOCKS_WARN_ON(c) \ ({ \ int __ret = 0; \ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 792bf0aa779..2e70006c7fa 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -46,6 +46,9 @@ extern const char linux_proc_banner[]; #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define _RET_IP_ (unsigned long)__builtin_return_address(0) +#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) + #ifdef CONFIG_LBD # include <asm/div64.h> # define sector_div(a, b) do_div(a, b) -- cgit v1.2.3-70-g09d2 From d2dbf343329dc777d77488743465f7be4245971d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 02:00:56 -0700 Subject: x86: clean up reserve_bootmem_generic() and port it to 32-bit 1. add reserve_bootmem_generic for 32bit 2. change len to unsigned long 3.
make early_res_to_bootmem to use it Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/mpparse.c | 18 ++++++------------ arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 3 ++- include/asm-x86/proto.h | 2 -- include/linux/bootmem.h | 2 ++ 6 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4f2cd5d179e..774063f11be 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -635,7 +635,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) continue; printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, final_start, final_end - 1, r->name); - reserve_bootmem(final_start, final_end - final_start, + reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 7ac1b689b70..b62ac6ba141 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -859,10 +859,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; -#ifdef CONFIG_X86_32 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { + unsigned long size = PAGE_SIZE; +#ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute * table size yet, as only few megabytes from @@ -872,22 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. 
*/ - unsigned long size = PAGE_SIZE; unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size, +#endif + reserve_bootmem_generic(mpf->mpf_physptr, size, BOOTMEM_DEFAULT); } -#else - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, - BOOTMEM_DEFAULT); - if (mpf->mpf_physptr) - reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE, BOOTMEM_DEFAULT); -#endif - return 1; + return 1; } bp += 4; length -= 16; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e7bb5e8167..abadb1da70d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -785,3 +785,9 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bf7bf1de6c2..b8c2c1ef7ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,7 +799,8 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) { #ifdef CONFIG_NUMA int nid, next_nid; diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index a9f51472521..3dd458c385c 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,8 +14,6 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); - extern void syscall32_cpu_init(void); extern void check_efer(void); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 686895bacd9..a1d9b79078e 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -84,6 
+84,8 @@ extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, + int flags); extern unsigned long free_all_bootmem(void); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern void *__alloc_bootmem_node(pg_data_t *pgdat, -- cgit v1.2.3-70-g09d2 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() In case we have kva before ramdisk on a node, we still need to use those ranges. v2: reserve_early the kva ram area, in case there are holes in highmem, so that those areas are not treated as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57f..c3f119e99e0 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final =
find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. 
+ * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397a61f..034a3156d2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c..d80e1868e57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. 
This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3-70-g09d2 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. 
v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4..ed46b7a6bc1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0..7c4d0255f8d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); 
printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70d..ba07a489230 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + 
add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60..55d31059690 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b4477..85c4fea41ff 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f..e4de460907c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern 
unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e57..41c6e3aa059 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3-70-g09d2 From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. 
Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 26 +++++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup64.c | 8 ++++-- arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++--------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa8..0ab59edd706 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include #include +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). 
+ */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af838c3c..dd12c1c84a8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. 
+ */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - 
__per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e820077..631ea6cc01d 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d3dae..bf083348745 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. 
+ */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ do_rest: } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9ac35a..b34e9a7cc80 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab0..0ea48a5af82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.2.3-70-g09d2 From a7bf0bd5e6af7fe69342dabf2a3b721f0163469a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 May 2008 15:02:14 +0100 Subject: build: add __page_aligned_data and __page_aligned_bss Making a variable page-aligned by using __attribute__((section(".data.page_aligned"))) is fragile because if sizeof(variable) is not also a multiple of page size, it leaves variables in the remainder of the section unaligned. This patch introduces two new qualifiers, __page_aligned_data and __page_aligned_bss to set the section *and* the alignment of variables. 
This makes page-aligned variables more robust because the linker will make sure they're aligned properly. Unfortunately it requires *all* page-aligned data to use these macros... Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup64.c | 2 +- arch/x86/mm/ioremap.c | 3 +-- include/linux/linkage.h | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 631ea6cc01d..fc1a56da824 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -40,7 +40,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; unsigned long __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL_GPL(__supported_pte_mask); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 416ea415f5c..0561fde56a5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -394,8 +394,7 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __section(.bss.page_aligned); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 2119610b24f..9fd1f859021 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -1,6 +1,7 @@ #ifndef _LINUX_LINKAGE_H #define _LINUX_LINKAGE_H +#include #include #ifdef __cplusplus @@ -17,6 +18,9 @@ # define asmregparm #endif +#define __page_aligned_data __section(.data.page_aligned) __aligned(PAGE_SIZE) +#define __page_aligned_bss __section(.bss.page_aligned) __aligned(PAGE_SIZE) + /* * This is used by architectures to keep arguments on the stack * untouched by 
the compiler by keeping them live until the end. -- cgit v1.2.3-70-g09d2 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range want to remove arch_get_ram_range, and use early_node_map instead. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e..a0484adbf59 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd21894..bb0642318a9 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + 
+static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + return data->ret; + +} + +static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b24041..cf1cd3a2ed7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff 
--git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059..e25b6b24f84 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3-70-g09d2 From 3c999f142665265afd0fe9190204dd051f17e505 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 20 Jun 2008 16:11:20 -0700 Subject: x86: check command line when CONFIG_X86_MPPARSE is not set, v2 if acpi=off, acpi=noirq and pci=noacpi, we need to disable apic. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Maciej W. 
Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 14 ++++++++++++++ arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++++ arch/x86/kernel/setup_32.c | 5 +++++ arch/x86/kernel/setup_64.c | 9 +++++++++ include/asm-x86/apic.h | 6 +++++- include/linux/acpi.h | 6 ++++++ 7 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6516359922b..5c0107602b6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1787,6 +1787,20 @@ static int __init parse_pci(char *arg) } early_param("pci", parse_pci); +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + printk(KERN_WARNING "MPS support code is not built-in.\n" + "Using acpi=off or acpi=noirq or pci=noacpi " + "may have problem\n"); + return 1; + } +#endif + return 0; +} + #ifdef CONFIG_X86_IO_APIC static int __init parse_acpi_skip_timer_override(char *arg) { diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index dd8de26b278..4932d7813bc 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -57,7 +57,7 @@ unsigned long mp_lapic_addr; * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata; +int enable_local_apic; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c5330f601b6..56aee55cf8d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -161,6 +161,10 @@ void __init setup_per_cpu_areas(void) char *ptr; int cpu; + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); #else diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 369d0fe1ff9..cad4e893df0 100644 --- 
a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -677,6 +677,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()){ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + finish_e820_parsing(); probe_roms(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index a93300de4da..175c696ec53 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -302,6 +302,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()) { + disable_apic = 1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); @@ -723,6 +728,10 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); + + /* early_param could clear that, but recall get it set again */ + if (disable_apic) + clear_cpu_cap(c, X86_FEATURE_APIC); } /* diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 313bcaf4b6c..9fe941cd843 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -39,8 +39,12 @@ extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int ioapic_force; -extern int disable_apic; +#ifdef CONFIG_X86_64 +extern int disable_apic; +#else +extern int enable_local_apic; +#endif /* * Basic functions accessing APICs. 
*/ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7edd7..0601075d09a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -82,6 +82,7 @@ char * __acpi_map_table (unsigned long phys_addr, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); int acpi_boot_table_init (void); +int acpi_mps_check (void); int acpi_numa_init (void); int acpi_table_init (void); @@ -250,6 +251,11 @@ static inline int acpi_boot_table_init(void) return 0; } +static inline int acpi_mps_check(void) +{ + return 0; +} + static inline int acpi_check_resource_conflict(struct resource *res) { return 0; -- cgit v1.2.3-70-g09d2 From 69ac9cd629ca96e59f34eb4ccd12d00b2c8276a7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Fri, 27 Jun 2008 13:12:54 +0200 Subject: sysfs: add /sys/firmware/memmap This patch adds /sys/firmware/memmap interface that represents the BIOS (or Firmware) provided memory map. The tree looks like: /sys/firmware/memmap/0/start (hex number) end (hex number) type (string) ... /1/start end type With the following shell snippet one can print the memory map in the same form the kernel prints itself when booting on x86 (the E820 map). --------- 8< -------------------------- #!/bin/sh cd /sys/firmware/memmap for dir in * ; do start=$(cat $dir/start) end=$(cat $dir/end) type=$(cat $dir/type) printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type" done --------- >8 -------------------------- That patch only provides the needed interface: 1. The sysfs interface. 2. The structure and enumeration definition. 3. The function firmware_map_add() and firmware_map_add_early() that should be called from architecture code (E820/EFI, for example) to add the contents to the interface. If the kernel is compiled without CONFIG_FIRMWARE_MEMMAP, the interface does nothing without cluttering the architecture-specific code with #ifdef's. The purpose of the new interface is kexec: While /proc/iomem represents the *used* memory map (e.g. 
modified via kernel parameters like 'memmap' and 'mem'), the /sys/firmware/memmap tree represents the unmodified memory map provided via the firmware. So kexec can: - use the original memory map for rebooting, - use the /proc/iomem for setting up the ELF core headers for kdump case that should only represent the memory of the system. The patch has been tested on i386 and x86_64. Signed-off-by: Bernhard Walle Acked-by: Greg KH Acked-by: Vivek Goyal Cc: kexec@lists.infradead.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- Documentation/ABI/testing/sysfs-firmware-memmap | 71 ++++++++ drivers/firmware/Kconfig | 10 ++ drivers/firmware/Makefile | 1 + drivers/firmware/memmap.c | 205 ++++++++++++++++++++++++ include/linux/firmware-map.h | 74 +++++++++ 5 files changed, 361 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-memmap create mode 100644 drivers/firmware/memmap.c create mode 100644 include/linux/firmware-map.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap new file mode 100644 index 00000000000..0d99ee6ae02 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-memmap @@ -0,0 +1,71 @@ +What: /sys/firmware/memmap/ +Date: June 2008 +Contact: Bernhard Walle +Description: + On all platforms, the firmware provides a memory map which the + kernel reads. The resources from that memory map are registered + in the kernel resource tree and exposed to userspace via + /proc/iomem (together with other resources). + + However, on most architectures that firmware-provided memory + map is modified afterwards by the kernel itself, either because + the kernel merges that memory map with other information or + just because the user overwrites that memory map via command + line. + + kexec needs the raw firmware-provided memory map to setup the + parameter segment of the kernel that should be booted with + kexec. 
Also, the raw memory map is useful for debugging. For + that reason, /sys/firmware/memmap is an interface that provides + the raw memory map to userspace. + + The structure is as follows: Under /sys/firmware/memmap there + are subdirectories with the number of the entry as their name: + + /sys/firmware/memmap/0 + /sys/firmware/memmap/1 + /sys/firmware/memmap/2 + /sys/firmware/memmap/3 + ... + + The maximum depends on the number of memory map entries provided + by the firmware. The order is just the order that the firmware + provides. + + Each directory contains three files: + + start : The start address (as a hexadecimal number with the + '0x' prefix). + end : The end address, inclusive (regardless of whether the + firmware provides inclusive or exclusive ranges). + type : Type of the entry as a string. See below for a list of + valid types. + + So, for example: + + /sys/firmware/memmap/0/start + /sys/firmware/memmap/0/end + /sys/firmware/memmap/0/type + /sys/firmware/memmap/1/start + ... + + Currently the following types exist: + + - System RAM + - ACPI Tables + - ACPI Non-volatile Storage + - reserved + + The following shell snippet can be used to display that memory + map in a human-readable format: + + -------------------- 8< ---------------------------------------- + #!/bin/bash + cd /sys/firmware/memmap + for dir in * ; do + start=$(cat $dir/start) + end=$(cat $dir/end) + type=$(cat $dir/type) + printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type" + done + -------------------- >8 ---------------------------------------- diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index dc2cec6127d..ebb9e51deb0 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -26,6 +26,16 @@ config EDD_OFF kernel. Say N if you want EDD enabled by default. EDD can be dynamically set using the kernel parameter 'edd={on|skipmbr|off}'. 
+config FIRMWARE_MEMMAP + bool "Add firmware-provided memory map to sysfs" if EMBEDDED + default (X86_64 || X86_32) + help + Add the firmware-provided (unmodified) memory map to /sys/firmware/memmap. + That memory map is used for example by kexec to set up parameter area + for the next kernel, but can also be used for debugging purposes. + + See also Documentation/ABI/testing/sysfs-firmware-memmap. + config EFI_VARS tristate "EFI Variable Support via sysfs" depends on EFI diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index 4c9147154df..1c3c17343db 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DCDBAS) += dcdbas.o obj-$(CONFIG_DMIID) += dmi-id.o obj-$(CONFIG_ISCSI_IBFT_FIND) += iscsi_ibft_find.o obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o +obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c new file mode 100644 index 00000000000..e23399c7f77 --- /dev/null +++ b/drivers/firmware/memmap.c @@ -0,0 +1,205 @@ +/* + * linux/drivers/firmware/memmap.c + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include + +/* + * Data types ------------------------------------------------------------------ + */ + +/* + * Firmware map entry. Because firmware memory maps are flat and not + * hierarchical, it's ok to organise them in a linked list. No parent + * information is necessary as for the resource tree. 
+ */ +struct firmware_map_entry { + resource_size_t start; /* start of the memory range */ + resource_size_t end; /* end of the memory range (incl.) */ + const char *type; /* type of the memory range */ + struct list_head list; /* entry for the linked list */ + struct kobject kobj; /* kobject for each entry */ +}; + +/* + * Forward declarations -------------------------------------------------------- + */ +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf); +static ssize_t start_show(struct firmware_map_entry *entry, char *buf); +static ssize_t end_show(struct firmware_map_entry *entry, char *buf); +static ssize_t type_show(struct firmware_map_entry *entry, char *buf); + +/* + * Static data ----------------------------------------------------------------- + */ + +struct memmap_attribute { + struct attribute attr; + ssize_t (*show)(struct firmware_map_entry *entry, char *buf); +}; + +struct memmap_attribute memmap_start_attr = __ATTR_RO(start); +struct memmap_attribute memmap_end_attr = __ATTR_RO(end); +struct memmap_attribute memmap_type_attr = __ATTR_RO(type); + +/* + * These are default attributes that are added for every memmap entry. + */ +static struct attribute *def_attrs[] = { + &memmap_start_attr.attr, + &memmap_end_attr.attr, + &memmap_type_attr.attr, + NULL +}; + +static struct sysfs_ops memmap_attr_ops = { + .show = memmap_attr_show, +}; + +static struct kobj_type memmap_ktype = { + .sysfs_ops = &memmap_attr_ops, + .default_attrs = def_attrs, +}; + +/* + * Registration functions ------------------------------------------------------ + */ + +/* + * Firmware memory map entries + */ +static LIST_HEAD(map_entries); + +/** + * Common implementation of firmware_map_add() and firmware_map_add_early() + * which expects a pre-allocated struct firmware_map_entry. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. 
+ * @entry: Pre-allocated (either kmalloc() or bootmem allocator), uninitialised + * entry. + */ +static int firmware_map_add_entry(resource_size_t start, resource_size_t end, + const char *type, + struct firmware_map_entry *entry) +{ + BUG_ON(start > end); + + entry->start = start; + entry->end = end; + entry->type = type; + INIT_LIST_HEAD(&entry->list); + kobject_init(&entry->kobj, &memmap_ktype); + + list_add_tail(&entry->list, &map_entries); + + return 0; +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = kmalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int __init firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = alloc_bootmem_low(sizeof(struct firmware_map_entry)); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * Sysfs functions ------------------------------------------------------------- + */ + +static ssize_t start_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->start); +} + +static ssize_t end_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->end); +} + +static ssize_t type_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); +} + +#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) + +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct firmware_map_entry *entry = to_memmap_entry(kobj); + 
struct memmap_attribute *memmap_attr = to_memmap_attr(attr); + + return memmap_attr->show(entry, buf); +} + +/* + * Initialises stuff and adds the entries in the map_entries list to + * sysfs. Important is that firmware_map_add() and firmware_map_add_early() + * must be called before late_initcall. + */ +static int __init memmap_init(void) +{ + int i = 0; + struct firmware_map_entry *entry; + struct kset *memmap_kset; + + memmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); + WARN_ON(!memmap_kset); + if (!memmap_kset) + return -ENOMEM; + + list_for_each_entry(entry, &map_entries, list) { + entry->kobj.kset = memmap_kset; + kobject_add(&entry->kobj, NULL, "%d", i++); + } + + return 0; +} +late_initcall(memmap_init); + diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h new file mode 100644 index 00000000000..acbdbcc1605 --- /dev/null +++ b/include/linux/firmware-map.h @@ -0,0 +1,74 @@ +/* + * include/linux/firmware-map.h: + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _LINUX_FIRMWARE_MAP_H +#define _LINUX_FIRMWARE_MAP_H + +#include +#include + +/* + * provide a dummy interface if CONFIG_FIRMWARE_MEMMAP is disabled + */ +#ifdef CONFIG_FIRMWARE_MEMMAP + +/** + * Adds a firmware mapping entry. This function uses kmalloc() for memory + * allocation. Use firmware_map_add_early() if you want to use the bootmem + * allocator. + * + * That function must be called before late_initcall. + * + * @start: Start of the memory range. 
+ * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type); + +/** + * Adds a firmware mapping entry. This function uses the bootmem allocator + * for memory allocation. Use firmware_map_add() if you want to use kmalloc(). + * + * That function must be called before late_initcall. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type); + +#else /* CONFIG_FIRMWARE_MEMMAP */ + +static inline int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + return 0; +} + +static inline int firmware_map_add_early(resource_size_t start, + resource_size_t end, const char *type) +{ + return 0; +} + +#endif /* CONFIG_FIRMWARE_MEMMAP */ + +#endif /* _LINUX_FIRMWARE_MAP_H */ -- cgit v1.2.3-70-g09d2 From a861beb1401d65e3f095fee074c13645ab06490e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 8 Jul 2008 19:27:22 +0200 Subject: ide: add __ide_default_irq() inline helper Add __ide_default_irq() inline helper and use it instead of ide_default_irq() in ide-probe.c and ns87415.c (all host drivers except IDE PCI ones always setup hwif->irq so it is enough to check only for I/O bases 0x1f0 and 0x170). This fixes post-2.6.25 regression since ide_default_irq() define could shadow ide_default_irq() inline. 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 8 ++------ drivers/ide/pci/ns87415.c | 6 +----- include/linux/ide.h | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index d27061b3932..26e68b65b7c 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1218,16 +1218,12 @@ static void drive_release_dev (struct device *dev) complete(&drive->gendev_rel_comp); } -#ifndef ide_default_irq -#define ide_default_irq(irq) 0 -#endif - static int hwif_init(ide_hwif_t *hwif) { int old_irq; if (!hwif->irq) { - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); if (!hwif->irq) { printk("%s: DISABLED, NO IRQ\n", hwif->name); return 0; @@ -1257,7 +1253,7 @@ static int hwif_init(ide_hwif_t *hwif) * It failed to initialise. Find the default IRQ for * this port and try that. */ - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); if (!hwif->irq) { printk("%s: Disabled unable to get IRQ %d.\n", hwif->name, old_irq); diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c index fec4955f449..a7a41bb8277 100644 --- a/drivers/ide/pci/ns87415.c +++ b/drivers/ide/pci/ns87415.c @@ -225,10 +225,6 @@ static int ns87415_dma_setup(ide_drive_t *drive) return 1; } -#ifndef ide_default_irq -#define ide_default_irq(irq) 0 -#endif - static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) { struct pci_dev *dev = to_pci_dev(hwif->dev); @@ -288,7 +284,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) } if (!using_inta) - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); else if (!hwif->irq && hwif->mate && hwif->mate->irq) hwif->irq = hwif->mate->irq; /* share IRQ with mate */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 
9918772bf27..eddb6daadf4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -189,6 +189,21 @@ static inline void ide_std_init_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = ctl_addr; } +/* for IDE PCI controllers in legacy mode, temporary */ +static inline int __ide_default_irq(unsigned long base) +{ + switch (base) { +#ifdef CONFIG_IA64 + case 0x1f0: return isa_irq_to_vector(14); + case 0x170: return isa_irq_to_vector(15); +#else + case 0x1f0: return 14; + case 0x170: return 15; +#endif + } + return 0; +} + #include #if !defined(MAX_HWIFS) || defined(CONFIG_EMBEDDED) -- cgit v1.2.3-70-g09d2 From 004a403c2e954734090a69aedc7f4f822bdcc142 Mon Sep 17 00:00:00 2001 From: Loc Ho Date: Wed, 14 May 2008 20:41:47 +0800 Subject: [CRYPTO] hash: Add asynchronous hash support This patch adds asynchronous hash and digest support. Signed-off-by: Loc Ho Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/ahash.c | 106 +++++++++++++++++++++++++++ crypto/api.c | 8 ++- crypto/digest.c | 81 +++++++++++++++++++++ crypto/hash.c | 102 +++++++++++++++++++++++--- crypto/internal.h | 1 + include/crypto/algapi.h | 36 ++++++++++ include/linux/crypto.h | 187 ++++++++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 507 insertions(+), 15 deletions(-) create mode 100644 crypto/ahash.c (limited to 'include/linux') diff --git a/crypto/Makefile b/crypto/Makefile index 807656b64e0..d4f3ed857df 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_CRYPTO_BLKCIPHER) += crypto_blkcipher.o obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o crypto_hash-objs := hash.o +crypto_hash-objs += ahash.o obj-$(CONFIG_CRYPTO_HASH) += crypto_hash.o obj-$(CONFIG_CRYPTO_MANAGER) += cryptomgr.o diff --git a/crypto/ahash.c b/crypto/ahash.c new file mode 100644 index 00000000000..a83e035d9a3 --- /dev/null +++ b/crypto/ahash.c @@ -0,0 +1,106 @@ +/* + * Asynchronous Cryptographic Hash operations. 
+ * + * This is the asynchronous version of hash.c with notification of + * completion via a callback. + * + * Copyright (c) 2008 Loc Ho + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ahash_alg *ahash = crypto_ahash_alg(tfm); + unsigned long alignmask = crypto_ahash_alignmask(tfm); + int ret; + u8 *buffer, *alignbuffer; + unsigned long absize; + + absize = keylen + alignmask; + buffer = kmalloc(absize, GFP_ATOMIC); + if (!buffer) + return -ENOMEM; + + alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); + memcpy(alignbuffer, key, keylen); + ret = ahash->setkey(tfm, alignbuffer, keylen); + memset(alignbuffer, 0, keylen); + kfree(buffer); + return ret; +} + +static int ahash_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ahash_alg *ahash = crypto_ahash_alg(tfm); + unsigned long alignmask = crypto_ahash_alignmask(tfm); + + if ((unsigned long)key & alignmask) + return ahash_setkey_unaligned(tfm, key, keylen); + + return ahash->setkey(tfm, key, keylen); +} + +static unsigned int crypto_ahash_ctxsize(struct crypto_alg *alg, u32 type, + u32 mask) +{ + return alg->cra_ctxsize; +} + +static int crypto_init_ahash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + struct ahash_alg *alg = &tfm->__crt_alg->cra_ahash; + struct ahash_tfm *crt = &tfm->crt_ahash; + + if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + crt->init = alg->init; + crt->update = alg->update; + crt->final = alg->final; + crt->digest = alg->digest; + crt->setkey = ahash_setkey; + crt->base = 
__crypto_ahash_cast(tfm); + crt->digestsize = alg->digestsize; + + return 0; +} + +static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) + __attribute__ ((unused)); +static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) +{ + seq_printf(m, "type : ahash\n"); + seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? + "yes" : "no"); + seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); + seq_printf(m, "digestsize : %u\n", alg->cra_hash.digestsize); +} + +const struct crypto_type crypto_ahash_type = { + .ctxsize = crypto_ahash_ctxsize, + .init = crypto_init_ahash_ops, +#ifdef CONFIG_PROC_FS + .show = crypto_ahash_show, +#endif +}; +EXPORT_SYMBOL_GPL(crypto_ahash_type); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Asynchronous cryptographic hash type"); diff --git a/crypto/api.c b/crypto/api.c index 0a0f41ef255..d06e33270ab 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -235,8 +235,12 @@ static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask) return crypto_init_cipher_ops(tfm); case CRYPTO_ALG_TYPE_DIGEST: - return crypto_init_digest_ops(tfm); - + if ((mask & CRYPTO_ALG_TYPE_HASH_MASK) != + CRYPTO_ALG_TYPE_HASH_MASK) + return crypto_init_digest_ops_async(tfm); + else + return crypto_init_digest_ops(tfm); + case CRYPTO_ALG_TYPE_COMPRESS: return crypto_init_compress_ops(tfm); diff --git a/crypto/digest.c b/crypto/digest.c index b526cc348b7..025c9aea24e 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -157,3 +157,84 @@ int crypto_init_digest_ops(struct crypto_tfm *tfm) void crypto_exit_digest_ops(struct crypto_tfm *tfm) { } + +static int digest_async_nosetkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + crypto_ahash_clear_flags(tfm_async, CRYPTO_TFM_RES_MASK); + return -ENOSYS; +} + +static int digest_async_setkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_ahash_tfm(tfm_async); + struct digest_alg *dalg = 
&tfm->__crt_alg->cra_digest; + + crypto_ahash_clear_flags(tfm_async, CRYPTO_TFM_RES_MASK); + return dalg->dia_setkey(tfm, key, keylen); +} + +static int digest_async_init(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct digest_alg *dalg = &tfm->__crt_alg->cra_digest; + + dalg->dia_init(tfm); + return 0; +} + +static int digest_async_update(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + update(&desc, req->src, req->nbytes); + return 0; +} + +static int digest_async_final(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + final(&desc, req->result); + return 0; +} + +static int digest_async_digest(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return digest(&desc, req->src, req->nbytes, req->result); +} + +int crypto_init_digest_ops_async(struct crypto_tfm *tfm) +{ + struct ahash_tfm *crt = &tfm->crt_ahash; + struct digest_alg *dalg = &tfm->__crt_alg->cra_digest; + + if (dalg->dia_digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + crt->init = digest_async_init; + crt->update = digest_async_update; + crt->final = digest_async_final; + crt->digest = digest_async_digest; + crt->setkey = dalg->dia_setkey ? 
digest_async_setkey : + digest_async_nosetkey; + crt->digestsize = dalg->dia_digestsize; + crt->base = __crypto_ahash_cast(tfm); + + return 0; +} diff --git a/crypto/hash.c b/crypto/hash.c index 7dcff671c19..f9400a014e7 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -59,24 +59,108 @@ static int hash_setkey(struct crypto_hash *crt, const u8 *key, return alg->setkey(crt, key, keylen); } -static int crypto_init_hash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +static int hash_async_setkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_ahash_tfm(tfm_async); + struct crypto_hash *tfm_hash = __crypto_hash_cast(tfm); + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + return alg->setkey(tfm_hash, key, keylen); +} + +static int hash_async_init(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->init(&desc); +} + +static int hash_async_update(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->update(&desc, req->src, req->nbytes); +} + +static int hash_async_final(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->final(&desc, req->result); +} + +static int hash_async_digest(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->digest(&desc, req->src, req->nbytes, req->result); +} + 
+static int crypto_init_hash_ops_async(struct crypto_tfm *tfm) +{ + struct ahash_tfm *crt = &tfm->crt_ahash; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + crt->init = hash_async_init; + crt->update = hash_async_update; + crt->final = hash_async_final; + crt->digest = hash_async_digest; + crt->setkey = hash_async_setkey; + crt->digestsize = alg->digestsize; + crt->base = __crypto_ahash_cast(tfm); + + return 0; +} + +static int crypto_init_hash_ops_sync(struct crypto_tfm *tfm) { struct hash_tfm *crt = &tfm->crt_hash; struct hash_alg *alg = &tfm->__crt_alg->cra_hash; - if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) - return -EINVAL; - - crt->init = alg->init; - crt->update = alg->update; - crt->final = alg->final; - crt->digest = alg->digest; - crt->setkey = hash_setkey; + crt->init = alg->init; + crt->update = alg->update; + crt->final = alg->final; + crt->digest = alg->digest; + crt->setkey = hash_setkey; crt->digestsize = alg->digestsize; return 0; } +static int crypto_init_hash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + if ((mask & CRYPTO_ALG_TYPE_HASH_MASK) != CRYPTO_ALG_TYPE_HASH_MASK) + return crypto_init_hash_ops_async(tfm); + else + return crypto_init_hash_ops_sync(tfm); +} + static void crypto_hash_show(struct seq_file *m, struct crypto_alg *alg) __attribute__ ((unused)); static void crypto_hash_show(struct seq_file *m, struct crypto_alg *alg) diff --git a/crypto/internal.h b/crypto/internal.h index 32f4c214560..683fcb2d91f 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -86,6 +86,7 @@ struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); int crypto_init_digest_ops(struct crypto_tfm *tfm); +int crypto_init_digest_ops_async(struct crypto_tfm *tfm); int crypto_init_cipher_ops(struct crypto_tfm 
*tfm); int crypto_init_compress_ops(struct crypto_tfm *tfm); diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 60d06e784be..fef272a8cee 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -98,6 +98,7 @@ extern const struct crypto_type crypto_ablkcipher_type; extern const struct crypto_type crypto_aead_type; extern const struct crypto_type crypto_blkcipher_type; extern const struct crypto_type crypto_hash_type; +extern const struct crypto_type crypto_ahash_type; void crypto_mod_put(struct crypto_alg *alg); @@ -314,5 +315,40 @@ static inline int crypto_requires_sync(u32 type, u32 mask) return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC; } +static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) +{ + return crypto_tfm_ctx(&tfm->base); +} + +static inline struct ahash_alg *crypto_ahash_alg( + struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; +} + +static inline int ahash_enqueue_request(struct crypto_queue *queue, + struct ahash_request *request) +{ + return crypto_enqueue_request(queue, &request->base); +} + +static inline struct ahash_request *ahash_dequeue_request( + struct crypto_queue *queue) +{ + return ahash_request_cast(crypto_dequeue_request(queue)); +} + +static inline void *ahash_request_ctx(struct ahash_request *req) +{ + return req->__ctx; +} + +static inline int ahash_tfm_in_queue(struct crypto_queue *queue, + struct crypto_ahash *tfm) +{ + return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); +} + + #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 425824bd49f..b6efe569128 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -30,15 +30,17 @@ */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 -#define CRYPTO_ALG_TYPE_DIGEST 0x00000002 -#define CRYPTO_ALG_TYPE_HASH 0x00000003 +#define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 +#define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define 
CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 -#define CRYPTO_ALG_TYPE_COMPRESS 0x00000008 -#define CRYPTO_ALG_TYPE_AEAD 0x00000009 +#define CRYPTO_ALG_TYPE_DIGEST 0x00000008 +#define CRYPTO_ALG_TYPE_HASH 0x00000009 +#define CRYPTO_ALG_TYPE_AHASH 0x0000000a #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e +#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000c #define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c #define CRYPTO_ALG_LARVAL 0x00000010 @@ -102,6 +104,7 @@ struct crypto_async_request; struct crypto_aead; struct crypto_blkcipher; struct crypto_hash; +struct crypto_ahash; struct crypto_tfm; struct crypto_type; struct aead_givcrypt_request; @@ -131,6 +134,18 @@ struct ablkcipher_request { void *__ctx[] CRYPTO_MINALIGN_ATTR; }; +struct ahash_request { + struct crypto_async_request base; + + void *info; + + unsigned int nbytes; + struct scatterlist *src; + u8 *result; + + void *__ctx[] CRYPTO_MINALIGN_ATTR; +}; + /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests @@ -195,6 +210,17 @@ struct ablkcipher_alg { unsigned int ivsize; }; +struct ahash_alg { + int (*init)(struct ahash_request *req); + int (*update)(struct ahash_request *req); + int (*final)(struct ahash_request *req); + int (*digest)(struct ahash_request *req); + int (*setkey)(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen); + + unsigned int digestsize; +}; + struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); @@ -272,6 +298,7 @@ struct compress_alg { #define cra_cipher cra_u.cipher #define cra_digest cra_u.digest #define cra_hash cra_u.hash +#define cra_ahash cra_u.ahash #define cra_compress cra_u.compress struct crypto_alg { @@ -298,6 +325,7 @@ struct crypto_alg { struct cipher_alg cipher; struct digest_alg digest; struct hash_alg hash; + struct ahash_alg ahash; struct compress_alg compress; } cra_u; @@ -383,6 +411,19 @@ struct 
hash_tfm { unsigned int digestsize; }; +struct ahash_tfm { + int (*init)(struct ahash_request *req); + int (*update)(struct ahash_request *req); + int (*final)(struct ahash_request *req); + int (*digest)(struct ahash_request *req); + int (*setkey)(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen); + + unsigned int digestsize; + struct crypto_ahash *base; + unsigned int reqsize; +}; + struct compress_tfm { int (*cot_compress)(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, @@ -397,6 +438,7 @@ struct compress_tfm { #define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_hash crt_u.hash +#define crt_ahash crt_u.ahash #define crt_compress crt_u.compress struct crypto_tfm { @@ -409,6 +451,7 @@ struct crypto_tfm { struct blkcipher_tfm blkcipher; struct cipher_tfm cipher; struct hash_tfm hash; + struct ahash_tfm ahash; struct compress_tfm compress; } crt_u; @@ -441,6 +484,10 @@ struct crypto_hash { struct crypto_tfm base; }; +struct crypto_ahash { + struct crypto_tfm base; +}; + enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, @@ -1264,5 +1311,137 @@ static inline int crypto_comp_decompress(struct crypto_comp *tfm, src, slen, dst, dlen); } +static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) +{ + return (struct crypto_ahash *)tfm; +} + +static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, + u32 type, u32 mask) +{ + type &= ~CRYPTO_ALG_TYPE_MASK; + mask &= ~CRYPTO_ALG_TYPE_MASK; + type |= CRYPTO_ALG_TYPE_AHASH; + mask |= CRYPTO_ALG_TYPE_AHASH_MASK; + + return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); +} + +static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) +{ + return &tfm->base; +} + +static inline void crypto_free_ahash(struct crypto_ahash *tfm) +{ + crypto_free_tfm(crypto_ahash_tfm(tfm)); +} + +static inline unsigned int crypto_ahash_alignmask( + struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); +} + 
+static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->crt_ahash; +} + +static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->digestsize; +} + +static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) +{ + return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); +} + +static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline struct crypto_ahash *crypto_ahash_reqtfm( + struct ahash_request *req) +{ + return __crypto_ahash_cast(req->base.tfm); +} + +static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->reqsize; +} + +static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ahash_tfm *crt = crypto_ahash_crt(tfm); + + return crt->setkey(crt->base, key, keylen); +} + +static inline int crypto_ahash_digest(struct ahash_request *req) +{ + struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); + return crt->digest(req); +} + +static inline void ahash_request_set_tfm(struct ahash_request *req, + struct crypto_ahash *tfm) +{ + req->base.tfm = crypto_ahash_tfm(crypto_ahash_crt(tfm)->base); +} + +static inline struct ahash_request *ahash_request_alloc( + struct crypto_ahash *tfm, gfp_t gfp) +{ + struct ahash_request *req; + + req = kmalloc(sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm), gfp); + + if (likely(req)) + ahash_request_set_tfm(req, tfm); + + return req; +} + +static inline void ahash_request_free(struct ahash_request *req) +{ + kfree(req); +} + +static inline struct ahash_request *ahash_request_cast( + struct crypto_async_request *req) +{ + return container_of(req, struct 
ahash_request, base); +} + +static inline void ahash_request_set_callback(struct ahash_request *req, + u32 flags, + crypto_completion_t complete, + void *data) +{ + req->base.complete = complete; + req->base.data = data; + req->base.flags = flags; +} + +static inline void ahash_request_set_crypt(struct ahash_request *req, + struct scatterlist *src, u8 *result, + unsigned int nbytes) +{ + req->src = src; + req->nbytes = nbytes; + req->result = result; +} + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3-70-g09d2 From 166247f46a9c866e6f7f7d2212be875fb82212a1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 7 Jul 2008 20:54:35 +0800 Subject: crypto: hash - Removed vestigial ahash fields The base field in ahash_tfm appears to have been cut-n-pasted from ablkcipher. It isn't needed here at all. Similarly, the info field in ahash_request also appears to have originated from its cipher counter-part and is vestigial. Signed-off-by: Herbert Xu --- crypto/ahash.c | 1 - crypto/digest.c | 1 - crypto/hash.c | 1 - include/linux/crypto.h | 7 ++----- 4 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/crypto/ahash.c b/crypto/ahash.c index 8c1f918a687..e6e5906ca80 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -76,7 +76,6 @@ static int crypto_init_ahash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) crt->final = alg->final; crt->digest = alg->digest; crt->setkey = ahash_setkey; - crt->base = __crypto_ahash_cast(tfm); crt->digestsize = alg->digestsize; return 0; diff --git a/crypto/digest.c b/crypto/digest.c index d63d5d96fee..bf332982c50 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -234,7 +234,6 @@ int crypto_init_digest_ops_async(struct crypto_tfm *tfm) crt->setkey = dalg->dia_setkey ? 
digest_async_setkey : digest_async_nosetkey; crt->digestsize = dalg->dia_digestsize; - crt->base = __crypto_ahash_cast(tfm); return 0; } diff --git a/crypto/hash.c b/crypto/hash.c index 0d7caa9ab74..140a75565f1 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -128,7 +128,6 @@ static int crypto_init_hash_ops_async(struct crypto_tfm *tfm) crt->digest = hash_async_digest; crt->setkey = hash_async_setkey; crt->digestsize = alg->digestsize; - crt->base = __crypto_ahash_cast(tfm); return 0; } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b6efe569128..68ef293644d 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -137,8 +137,6 @@ struct ablkcipher_request { struct ahash_request { struct crypto_async_request base; - void *info; - unsigned int nbytes; struct scatterlist *src; u8 *result; @@ -420,7 +418,6 @@ struct ahash_tfm { unsigned int keylen); unsigned int digestsize; - struct crypto_ahash *base; unsigned int reqsize; }; @@ -1384,7 +1381,7 @@ static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, { struct ahash_tfm *crt = crypto_ahash_crt(tfm); - return crt->setkey(crt->base, key, keylen); + return crt->setkey(tfm, key, keylen); } static inline int crypto_ahash_digest(struct ahash_request *req) @@ -1396,7 +1393,7 @@ static inline int crypto_ahash_digest(struct ahash_request *req) static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { - req->base.tfm = crypto_ahash_tfm(crypto_ahash_crt(tfm)->base); + req->base.tfm = crypto_ahash_tfm(tfm); } static inline struct ahash_request *ahash_request_alloc( -- cgit v1.2.3-70-g09d2 From 18e33e6d5cc0495826f5245777cd267732815e01 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 10 Jul 2008 16:01:22 +0800 Subject: crypto: hash - Move ahash functions into crypto/hash.h All new crypto interfaces should go into individual files as much as possible in order to ensure that crypto.h does not collapse under its own weight. 
This patch moves the ahash code into crypto/hash.h and crypto/internal/hash.h respectively. Signed-off-by: Herbert Xu --- crypto/cryptd.c | 1 + crypto/digest.c | 1 + crypto/hash.c | 1 + crypto/tcrypt.c | 1 + include/crypto/algapi.h | 36 ---------- include/crypto/hash.h | 154 +++++++++++++++++++++++++++++++++++++++++ include/crypto/internal/hash.h | 37 ++++++++++ include/linux/crypto.h | 136 ------------------------------------ 8 files changed, 195 insertions(+), 172 deletions(-) create mode 100644 include/crypto/hash.h (limited to 'include/linux') diff --git a/crypto/cryptd.c b/crypto/cryptd.c index d3ecd7e73b7..d29e06b350f 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/crypto/digest.c b/crypto/digest.c index bf332982c50..ac0919460d1 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -12,6 +12,7 @@ * */ +#include #include #include #include diff --git a/crypto/hash.c b/crypto/hash.c index 140a75565f1..cb86b19fd10 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -9,6 +9,7 @@ * any later version. 
*/ +#include #include #include #include diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 87f08f9b090..59821a22d75 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -15,6 +15,7 @@ * */ +#include #include #include #include diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index fef272a8cee..60d06e784be 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -98,7 +98,6 @@ extern const struct crypto_type crypto_ablkcipher_type; extern const struct crypto_type crypto_aead_type; extern const struct crypto_type crypto_blkcipher_type; extern const struct crypto_type crypto_hash_type; -extern const struct crypto_type crypto_ahash_type; void crypto_mod_put(struct crypto_alg *alg); @@ -315,40 +314,5 @@ static inline int crypto_requires_sync(u32 type, u32 mask) return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC; } -static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) -{ - return crypto_tfm_ctx(&tfm->base); -} - -static inline struct ahash_alg *crypto_ahash_alg( - struct crypto_ahash *tfm) -{ - return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; -} - -static inline int ahash_enqueue_request(struct crypto_queue *queue, - struct ahash_request *request) -{ - return crypto_enqueue_request(queue, &request->base); -} - -static inline struct ahash_request *ahash_dequeue_request( - struct crypto_queue *queue) -{ - return ahash_request_cast(crypto_dequeue_request(queue)); -} - -static inline void *ahash_request_ctx(struct ahash_request *req) -{ - return req->__ctx; -} - -static inline int ahash_tfm_in_queue(struct crypto_queue *queue, - struct crypto_ahash *tfm) -{ - return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); -} - - #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/crypto/hash.h b/include/crypto/hash.h new file mode 100644 index 00000000000..d12498ec8a4 --- /dev/null +++ b/include/crypto/hash.h @@ -0,0 +1,154 @@ +/* + * Hash: Hash algorithms under the crypto API + * + * Copyright (c) 2008 Herbert Xu + * + * This 
program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#ifndef _CRYPTO_HASH_H +#define _CRYPTO_HASH_H + +#include + +struct crypto_ahash { + struct crypto_tfm base; +}; + +static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) +{ + return (struct crypto_ahash *)tfm; +} + +static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, + u32 type, u32 mask) +{ + type &= ~CRYPTO_ALG_TYPE_MASK; + mask &= ~CRYPTO_ALG_TYPE_MASK; + type |= CRYPTO_ALG_TYPE_AHASH; + mask |= CRYPTO_ALG_TYPE_AHASH_MASK; + + return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); +} + +static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) +{ + return &tfm->base; +} + +static inline void crypto_free_ahash(struct crypto_ahash *tfm) +{ + crypto_free_tfm(crypto_ahash_tfm(tfm)); +} + +static inline unsigned int crypto_ahash_alignmask( + struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); +} + +static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->crt_ahash; +} + +static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->digestsize; +} + +static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) +{ + return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); +} + +static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline struct crypto_ahash *crypto_ahash_reqtfm( + struct ahash_request *req) +{ + return __crypto_ahash_cast(req->base.tfm); +} + 
+static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->reqsize; +} + +static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ahash_tfm *crt = crypto_ahash_crt(tfm); + + return crt->setkey(tfm, key, keylen); +} + +static inline int crypto_ahash_digest(struct ahash_request *req) +{ + struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); + return crt->digest(req); +} + +static inline void ahash_request_set_tfm(struct ahash_request *req, + struct crypto_ahash *tfm) +{ + req->base.tfm = crypto_ahash_tfm(tfm); +} + +static inline struct ahash_request *ahash_request_alloc( + struct crypto_ahash *tfm, gfp_t gfp) +{ + struct ahash_request *req; + + req = kmalloc(sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm), gfp); + + if (likely(req)) + ahash_request_set_tfm(req, tfm); + + return req; +} + +static inline void ahash_request_free(struct ahash_request *req) +{ + kfree(req); +} + +static inline struct ahash_request *ahash_request_cast( + struct crypto_async_request *req) +{ + return container_of(req, struct ahash_request, base); +} + +static inline void ahash_request_set_callback(struct ahash_request *req, + u32 flags, + crypto_completion_t complete, + void *data) +{ + req->base.complete = complete; + req->base.data = data; + req->base.flags = flags; +} + +static inline void ahash_request_set_crypt(struct ahash_request *req, + struct scatterlist *src, u8 *result, + unsigned int nbytes) +{ + req->src = src; + req->nbytes = nbytes; + req->result = result; +} + +#endif /* _CRYPTO_HASH_H */ diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 93ac4228022..917ae57bad4 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -14,6 +14,7 @@ #define _CRYPTO_INTERNAL_HASH_H #include +#include struct ahash_request; struct scatterlist; @@ -33,9 +34,45 @@ struct crypto_hash_walk { unsigned int 
flags; }; +extern const struct crypto_type crypto_ahash_type; + int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); +static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) +{ + return crypto_tfm_ctx(&tfm->base); +} + +static inline struct ahash_alg *crypto_ahash_alg( + struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; +} + +static inline int ahash_enqueue_request(struct crypto_queue *queue, + struct ahash_request *request) +{ + return crypto_enqueue_request(queue, &request->base); +} + +static inline struct ahash_request *ahash_dequeue_request( + struct crypto_queue *queue) +{ + return ahash_request_cast(crypto_dequeue_request(queue)); +} + +static inline void *ahash_request_ctx(struct ahash_request *req) +{ + return req->__ctx; +} + +static inline int ahash_tfm_in_queue(struct crypto_queue *queue, + struct crypto_ahash *tfm) +{ + return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); +} + #endif /* _CRYPTO_INTERNAL_HASH_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 68ef293644d..c43dc47fdf7 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -481,10 +481,6 @@ struct crypto_hash { struct crypto_tfm base; }; -struct crypto_ahash { - struct crypto_tfm base; -}; - enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, @@ -1308,137 +1304,5 @@ static inline int crypto_comp_decompress(struct crypto_comp *tfm, src, slen, dst, dlen); } -static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_ahash *)tfm; -} - -static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, - u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - mask &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_AHASH; - mask |= CRYPTO_ALG_TYPE_AHASH_MASK; - - return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); -} - -static inline struct crypto_tfm 
*crypto_ahash_tfm(struct crypto_ahash *tfm) -{ - return &tfm->base; -} - -static inline void crypto_free_ahash(struct crypto_ahash *tfm) -{ - crypto_free_tfm(crypto_ahash_tfm(tfm)); -} - -static inline unsigned int crypto_ahash_alignmask( - struct crypto_ahash *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); -} - -static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) -{ - return &crypto_ahash_tfm(tfm)->crt_ahash; -} - -static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) -{ - return crypto_ahash_crt(tfm)->digestsize; -} - -static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) -{ - return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); -} - -static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) -{ - crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); -} - -static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) -{ - crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); -} - -static inline struct crypto_ahash *crypto_ahash_reqtfm( - struct ahash_request *req) -{ - return __crypto_ahash_cast(req->base.tfm); -} - -static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) -{ - return crypto_ahash_crt(tfm)->reqsize; -} - -static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, - const u8 *key, unsigned int keylen) -{ - struct ahash_tfm *crt = crypto_ahash_crt(tfm); - - return crt->setkey(tfm, key, keylen); -} - -static inline int crypto_ahash_digest(struct ahash_request *req) -{ - struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); - return crt->digest(req); -} - -static inline void ahash_request_set_tfm(struct ahash_request *req, - struct crypto_ahash *tfm) -{ - req->base.tfm = crypto_ahash_tfm(tfm); -} - -static inline struct ahash_request *ahash_request_alloc( - struct crypto_ahash *tfm, gfp_t gfp) -{ - struct ahash_request *req; - - req = kmalloc(sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm), 
gfp); - - if (likely(req)) - ahash_request_set_tfm(req, tfm); - - return req; -} - -static inline void ahash_request_free(struct ahash_request *req) -{ - kfree(req); -} - -static inline struct ahash_request *ahash_request_cast( - struct crypto_async_request *req) -{ - return container_of(req, struct ahash_request, base); -} - -static inline void ahash_request_set_callback(struct ahash_request *req, - u32 flags, - crypto_completion_t complete, - void *data) -{ - req->base.complete = complete; - req->base.data = data; - req->base.flags = flags; -} - -static inline void ahash_request_set_crypt(struct ahash_request *req, - struct scatterlist *src, u8 *result, - unsigned int nbytes) -{ - req->src = src; - req->nbytes = nbytes; - req->result = result; -} - #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3-70-g09d2 From b7a39bd0afc4021e8ad2b1189e884551e147427f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 18:38:49 +0100 Subject: firmware: make fw->data const In preparation for supporting firmware files linked into the static kernel, make fw->data const to ensure that users aren't modifying it (so that we can pass a pointer to the original in-kernel copy, rather than having to copy it). 
Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 2 +- include/linux/firmware.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 9fd4a853414..264b3a2cd86 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -257,7 +257,7 @@ firmware_data_write(struct kobject *kobj, struct bin_attribute *bin_attr, if (retval) goto out; - memcpy(fw->data + offset, buffer, count); + memcpy((u8 *)fw->data + offset, buffer, count); fw->size = max_t(size_t, offset + count, fw->size); retval = count; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 6c7eff2ebad..88718d60153 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -8,7 +8,7 @@ struct firmware { size_t size; - u8 *data; + const u8 *data; }; struct device; -- cgit v1.2.3-70-g09d2 From 5658c769443d543728b6c5c673dffc2df8676317 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 13:52:42 +0100 Subject: firmware: allow firmware files to be built into kernel image Some drivers have their own hacks to bypass the kernel's firmware loader and build their firmware into the kernel; this renders those unnecessary. Other drivers don't use the firmware loader at all, because they always want the firmware to be available. This allows them to start using the firmware loader. A third set of drivers already use the firmware loader, but can't be used without help from userspace, which sometimes requires an initrd. This allows them to work in a static kernel. 
Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 33 +++++++++++++++++++++++++++++++-- include/asm-generic/vmlinux.lds.h | 7 +++++++ include/linux/firmware.h | 21 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 264b3a2cd86..b0be1d18fee 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -49,6 +49,14 @@ struct firmware_priv { struct timer_list timeout; }; +#ifdef CONFIG_FW_LOADER +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; +#else /* Module case. Avoid ifdefs later; it'll all optimise out */ +static struct builtin_fw *__start_builtin_fw; +static struct builtin_fw *__end_builtin_fw; +#endif + static void fw_load_abort(struct firmware_priv *fw_priv) { @@ -391,13 +399,12 @@ _request_firmware(const struct firmware **firmware_p, const char *name, struct device *f_dev; struct firmware_priv *fw_priv; struct firmware *firmware; + struct builtin_fw *builtin; int retval; if (!firmware_p) return -EINVAL; - printk(KERN_INFO "firmware: requesting %s\n", name); - *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); if (!firmware) { printk(KERN_ERR "%s: kmalloc(struct firmware) failed\n", @@ -406,6 +413,20 @@ _request_firmware(const struct firmware **firmware_p, const char *name, goto out; } + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (strcmp(name, builtin->name)) + continue; + printk(KERN_INFO "firmware: using built-in firmware %s\n", + name); + firmware->size = builtin->size; + firmware->data = builtin->data; + return 0; + } + + if (uevent) + printk(KERN_INFO "firmware: requesting %s\n", name); + retval = fw_setup_device(firmware, &f_dev, name, device, uevent); if (retval) goto error_kfree_fw; @@ -473,8 +494,16 @@ request_firmware(const struct firmware **firmware_p, const char *name, void 
release_firmware(const struct firmware *fw) { + struct builtin_fw *builtin; + if (fw) { + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (fw->data == builtin->data) + goto free_fw; + } vfree(fw->data); + free_fw: kfree(fw); } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916..8d71a40625f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -86,6 +86,13 @@ VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \ } \ \ + /* Built-in firmware blobs */ \ + .builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_builtin_fw) = .; \ + *(.builtin_fw) \ + VMLINUX_SYMBOL(__end_builtin_fw) = .; \ + } \ + \ /* RapidIO route ops */ \ .rio_route : AT(ADDR(.rio_route) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rio_route_ops) = .; \ diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 88718d60153..c8ecf5b2a20 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -1,7 +1,10 @@ #ifndef _LINUX_FIRMWARE_H #define _LINUX_FIRMWARE_H + #include #include +#include + #define FIRMWARE_NAME_MAX 30 #define FW_ACTION_NOHOTPLUG 0 #define FW_ACTION_HOTPLUG 1 @@ -13,6 +16,24 @@ struct firmware { struct device; +struct builtin_fw { + char *name; + void *data; + unsigned long size; +}; + +/* We have to play tricks here much like stringify() to get the + __COUNTER__ macro to be expanded as we want it */ +#define __fw_concat1(x, y) x##y +#define __fw_concat(x, y) __fw_concat1(x, y) + +#define DECLARE_BUILTIN_FIRMWARE(name, blob) \ + DECLARE_BUILTIN_FIRMWARE_SIZE(name, &(blob), sizeof(blob)) + +#define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size) \ + static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ + __used __section(.builtin_fw) = { name, blob, size } + #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, 
struct device *device); -- cgit v1.2.3-70-g09d2 From bacfe09dd7545467965e8d8f1eab20bc62dce00d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 30 May 2008 13:57:27 +0300 Subject: ihex.h: binary representation of ihex records Some devices need their firmware as a set of {address, len, data...} records in some specific order rather than a simple blob. The normal way of doing this kind of thing is 'ihex', which is a text format and not entirely suitable for use in the kernel. This provides a binary representation which is very similar, but much more compact -- and a helper routine to skip to the next record, because the alignment constraints mean that everybody will screw it up for themselves otherwise. Also a helper function which can verify that a 'struct firmware' contains a valid set of ihex records, and that following them won't run off the end of the loaded data. Signed-off-by: David Woodhouse --- include/linux/ihex.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 include/linux/ihex.h (limited to 'include/linux') diff --git a/include/linux/ihex.h b/include/linux/ihex.h new file mode 100644 index 00000000000..df89edd890a --- /dev/null +++ b/include/linux/ihex.h @@ -0,0 +1,50 @@ +/* + * Compact binary representation of ihex records. Some devices need their + * firmware loaded in strange orders rather than a single big blob, but + * actually parsing ihex-as-text within the kernel seems silly. Thus,... 
+ */ + +#ifndef __LINUX_IHEX_H__ +#define __LINUX_IHEX_H__ + +#include +#include + +/* Intel HEX files actually limit the length to 256 bytes, but we have + drivers which would benefit from using separate records which are + longer than that, so we extend to 16 bits of length */ +struct ihex_binrec { + __be32 addr; + __be16 len; + uint8_t data[0]; +} __attribute__((aligned(4))); + +/* Find the next record, taking into account the 4-byte alignment */ +static inline const struct ihex_binrec * +ihex_next_binrec(const struct ihex_binrec *rec) +{ + int next = ((be16_to_cpu(rec->len) + 5) & ~3) - 2; + rec = (void *)&rec->data[next]; + + return be16_to_cpu(rec->len) ? rec : NULL; +} + +/* Check that ihex_next_binrec() won't take us off the end of the image... */ +static inline int ihex_validate_fw(const struct firmware *fw) +{ + const struct ihex_binrec *rec; + size_t ofs = 0; + + while (ofs <= fw->size - sizeof(*rec)) { + rec = (void *)&fw->data[ofs]; + + /* Zero length marks end of records */ + if (!be16_to_cpu(rec->len)) + return 0; + + /* Point to next record... */ + ofs += (sizeof(*rec) + be16_to_cpu(rec->len) + 3) & ~3; + } + return -EINVAL; +} +#endif /* __LINUX_IHEX_H__ */ -- cgit v1.2.3-70-g09d2 From f1485f3deb89e6ae10c4d34662ec9e692855ab5d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 31 May 2008 15:20:37 +0300 Subject: ihex: request_ihex_firmware() function to load and validate firmware Provide a helper to load the file and validate it in one call, to simplify error handling in the drivers which are going to use it. 
Signed-off-by: David Woodhouse --- include/linux/ihex.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ihex.h b/include/linux/ihex.h index df89edd890a..2baace2788a 100644 --- a/include/linux/ihex.h +++ b/include/linux/ihex.h @@ -9,6 +9,7 @@ #include #include +#include /* Intel HEX files actually limit the length to 256 bytes, but we have drivers which would benefit from using separate records which are @@ -47,4 +48,27 @@ static inline int ihex_validate_fw(const struct firmware *fw) } return -EINVAL; } + +/* Request firmware and validate it so that we can trust we won't + * run off the end while reading records... */ +static inline int request_ihex_firmware(const struct firmware **fw, + const char *fw_name, + struct device *dev) +{ + const struct firmware *lfw; + int ret; + + ret = request_firmware(&lfw, fw_name, dev); + if (ret) + return ret; + ret = ihex_validate_fw(lfw); + if (ret) { + dev_err(dev, "Firmware \"%s\" not valid IHEX records\n", + fw_name); + release_firmware(lfw); + return ret; + } + *fw = lfw; + return 0; +} #endif /* __LINUX_IHEX_H__ */ -- cgit v1.2.3-70-g09d2 From ccf9b3b83d0e56fbf20c00a08b15031ce13204a7 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 10 Jul 2008 16:55:37 -0700 Subject: xfrm: Add a XFRM_STATE_AF_UNSPEC flag to xfrm_usersa_info Add a XFRM_STATE_AF_UNSPEC flag to handle the AF_UNSPEC behavior for the selector family. Userspace applications can set this flag to leave the selector family of the xfrm_state unspecified. This can be used to handle inter family tunnels if the selector is not set from userspace. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S.
Miller --- include/linux/xfrm.h | 1 + net/xfrm/xfrm_user.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 2ca6bae8872..fb0c215a305 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -339,6 +339,7 @@ struct xfrm_usersa_info { #define XFRM_STATE_NOPMTUDISC 4 #define XFRM_STATE_WILDRECV 8 #define XFRM_STATE_ICMP 16 +#define XFRM_STATE_AF_UNSPEC 32 }; struct xfrm_usersa_id { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b976d9ed10e..04c41504f84 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -277,9 +277,8 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr)); x->props.flags = p->flags; - if (!x->sel.family) + if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC)) x->sel.family = p->family; - } /* -- cgit v1.2.3-70-g09d2 From a2bb6a3d85ef3124cd336403a95abc0540d3fbe2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:15 -0400 Subject: ftrace: add ftrace_kill_atomic It has been suggested that I add a way to disable the function tracer on an oops. This code adds a ftrace_kill_atomic. It is not meant to be used in normal situations. It will disable the ftrace tracer, but will not perform the nice shutdown that requires scheduling. 
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3121b95443d..f368d041e02 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ void ftrace_enable_daemon(void); /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); +void ftrace_kill_atomic(void); static inline void tracer_disable(void) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0f271c45cd0..1359632668a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1601,6 +1601,21 @@ core_initcall(ftrace_dynamic_init); # define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill_atomic - kill ftrace from critical sections + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill_atomic(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + ftraced_suspend = -1; + clear_ftrace_function(); +} + /** * ftrace_kill - totally shutdown ftrace * -- cgit v1.2.3-70-g09d2 From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latency timings of ftrace and also caused the irqoff self tests to fail. What was happening is with NO_HZ the idle would stop the jiffy counter and before the jiffy counter was updated the sched_clock would have a bad delta jiffies to compare with the gtod with the maximum.
The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would compare the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8..33a8f42041f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void 
sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cb..97159e225a7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. 
+ */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591..d63008b09a4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3-70-g09d2 From 736603ab297506f4396cb5af592004499950fcfd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Add commit time into the commit block Carlo Wood has demonstrated that it's possible to recover deleted files from the journal. Something that will make this easier is if we can put the time of the commit into commit block. 
Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 +++ include/linux/jbd2.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7cee..92b6ac3df8a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -112,6 +112,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +127,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d147f0f9036..ec9cadf5822 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -168,6 +168,8 @@ struct commit_header { unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; }; /* -- cgit v1.2.3-70-g09d2 From f4c0a0fdfae708f7aa438c27a380ed4071294e11 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: export filemap_fdatawrite_range() Make filemap_fdatawrite_range() function public, so that it can later be used in ordered mode rewrite by JBD/JBD2. 
Signed-off-by: Jan Kara --- include/linux/fs.h | 2 ++ mm/filemap.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d8e2762ed14..97f992adc62 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end); extern int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); +extern int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); extern long do_fsync(struct file *file, int datasync); extern void sync_supers(void); diff --git a/mm/filemap.c b/mm/filemap.c index 1e6a7d34874..65d9d9e2b75 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush -- cgit v1.2.3-70-g09d2 From c851ed540173736e60d48b53b91a16ea5c903896 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Implement data=ordered mode handling via inodes This patch adds necessary framework into JBD2 to be able to track inodes with each transaction and write-out their dirty data during transaction commit time. This new ordered mode brings all sorts of advantages such as possibility to get rid of journal heads and buffer heads for data buffers in ordered mode, better ordering of writes on transaction commit, simplification of some JBD code, no more anonymous pages when truncate of data being committed happens. 
Also with this new ordered mode, delayed allocation on ordered mode is much simpler. Signed-off-by: Jan Kara --- fs/jbd2/commit.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/jbd2/journal.c | 52 +++++++++++++++++++++++++++++ fs/jbd2/transaction.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 42 ++++++++++++++++++++++++ 4 files changed, 270 insertions(+) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 92b6ac3df8a..3ca107b5c86 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,81 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawrite_range(mapping, 0, + i_size_read(jinode->i_vfs_inode)); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. 
+ * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_inode_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { struct page *page = bh->b_page; @@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ err = 0; journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); /* * Wait for all previously submitted IO to complete if commit @@ -760,6 +838,17 @@ start_journal_io: __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. 
+ */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -880,6 +969,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a7..78cf7bd7f60 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -82,6 +82,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2194,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. 
+ */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + /* * debugfs tunables */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ba620c4493d..98b596d2370 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -51,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. 
*/ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -2195,3 +2196,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... 
*/ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * This function must be called when inode is journaled in ordered mode + * before truncation happens. It starts writeout of truncated part in + * case it is in the committing transaction so that we stand to ordered + * mode consistency guarantees. + */ +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + journal_t *journal; + transaction_t *commit_trans; + int ret = 0; + + if (!inode->i_transaction && !inode->i_next_transaction) + goto out; + journal = inode->i_transaction->t_journal; + spin_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + spin_unlock(&journal->j_state_lock); + if (inode->i_transaction == commit_trans) { + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ec9cadf5822..622c3d8ca4e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -381,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) bit_spin_unlock(BH_JournalHead, &bh->b_state); } +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +/* Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. We cannot use reference to inode for this + * since we cannot afford doing last iput() on behalf of kjournald + */ +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) + +/** + * struct jbd_inode is the structure linking inodes in ordered mode + * present in a transaction so that we can sync them during commit. + */ +struct jbd2_inode { + /* Which transaction does this inode belong to? Either the running + * transaction or the committing one. 
[j_list_lock] */ + transaction_t *i_transaction; + + /* Pointer to the running transaction modifying inode's data in case + * there is already a committing transaction touching it. [j_list_lock] */ + transaction_t *i_next_transaction; + + /* List of inodes in the i_transaction [j_list_lock] */ + struct list_head i_list; + + /* VFS inode this inode belongs to [constant during the lifetime + * of the structure] */ + struct inode *i_vfs_inode; + + /* Flags of inode [j_list_lock] */ + unsigned int i_flags; +}; + struct jbd2_revoke_table_s; /** @@ -566,6 +598,12 @@ struct transaction_s */ struct journal_head *t_log_list; + /* + * List of inodes whose data we've modified in data=ordered mode. + * [j_list_lock] + */ + struct list_head t_inode_list; + /* * Protects info related to handles */ @@ -1046,6 +1084,10 @@ extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management -- cgit v1.2.3-70-g09d2 From 87c89c232c8f7b3820c33c3b9bc803e9358027da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Remove data=ordered mode support using jbd buffer heads Signed-off-by: Jan Kara --- fs/jbd2/checkpoint.c | 1 - fs/jbd2/commit.c | 221 ++------------------------------------------------ fs/jbd2/journal.c | 1 - fs/jbd2/transaction.c | 217 ++----------------------------------------------- include/linux/jbd2.h | 29 ++----- 5 files changed, 21 insertions(+), 448 deletions(-) (limited to 'include/linux') diff 
--git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022c..91389c8aee8 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3ca107b5c86..483183d15ed 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -79,21 +79,6 @@ nope: __brelse(bh); } -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - /* * Done it all: now submit the commit record. 
We should have * cleaned up our previous buffers by now, so if we are in abort @@ -199,162 +184,6 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } -/* - * Wait for all submitted IO to complete. - */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) -{ - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } - return ret; - } - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); - } -} - -/* - * Submit all the data buffers to disk - */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. 
- */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? 
*/ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); -} - /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -602,24 +431,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); err = journal_submit_inode_data_buffers(journal, commit_transaction); - if (err) - jbd2_journal_abort(journal, err); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. 
- */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - if (err) jbd2_journal_abort(journal, err); @@ -627,16 +439,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out @@ -655,6 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -829,13 +632,6 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } /* @@ -860,7 +656,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. 
@@ -919,7 +715,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -946,7 +742,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -966,9 +762,8 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); @@ -1090,7 +885,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 78cf7bd7f60..b26c6d9fe6a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 98b596d2370..4f7cadbb19f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -942,183 +942,6 @@ out: return err; } -/** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. 
- * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. - */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? 
*/ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. 
We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. 
- */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. 
*/ @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_del_buffer(list, jh); @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1878,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; + /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1941,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. 
- */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } /* * If it is committing, we simply cannot touch it. We * can remove it's next_transaction pointer from the @@ -2082,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -2104,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_add_buffer(list, jh); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 622c3d8ca4e..3dd20900709 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -542,24 +542,12 @@ struct transaction_s */ struct journal_head *t_reserved_list; - /* - * Doubly-linked circular list of all buffers under writeout during - * commit [j_list_lock] - */ - struct journal_head *t_locked_list; - /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock] */ struct journal_head *t_buffers; - /* - * Doubly-linked circular list of all data buffers still to be - * flushed before this transaction can be committed [j_list_lock] - */ - struct journal_head *t_sync_datalist; - /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) @@ -1044,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, 
struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1223,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal) /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_SyncData 1 /* Normal data: flush before commit */ -#define BJ_Metadata 2 /* Normal journaled metadata */ -#define BJ_Forget 3 /* Buffer superseded by this transaction */ -#define BJ_IO 4 /* Buffer is for temporary IO use */ -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ -#define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_IO 3 /* Buffer is for temporary IO use */ +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ +#define BJ_Types 7 extern int jbd_blocks_per_page(struct inode *inode); -- cgit v1.2.3-70-g09d2 From 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: add hooks for ext4's delayed allocation support Export mpage_bio_submit() and __mpage_writepage() for the benefit of ext4's delayed allocation support. Also change __block_write_full_page so that, for buffers that have the BH_Delay flag set, it will call get_block() to get the physical block allocated, just as in the !BH_Mapped case.
Signed-off-by: Alex Tomas Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 7 +++++-- fs/mpage.c | 14 +++++--------- include/linux/mpage.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index f4b033237a0..5fa1512cd9a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a..dbcc7af76a1 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs. 
*/ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 068a0c9946a..5c42821da2d 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,11 +11,21 @@ */ #ifdef CONFIG_BLOCK +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + struct writeback_control; +struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3-70-g09d2 From e8ced39d5e8911c662d4d69a342b9d053eaaac4e Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: percpu_counter: new function percpu_counter_sum_and_set Delayed allocation needs to check free blocks at every write time. percpu_counter_read_positive() is not quite accurate. Delayed allocation needs more accurate accounting, but using percpu_counter_sum_positive() frequently is quite expensive.
This patch adds a new function that updates the central counter while summing the per-cpu counters, to increase the accuracy of the next percpu_counter_read() and to require fewer calls to the expensive percpu_counter_sum(). Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- include/linux/percpu_counter.h | 12 +++++++++--- lib/percpu_counter.c | 7 ++++++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 25f63d8c1b3..6369bacf0dc 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1621,7 +1621,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, #ifdef CONFIG_SMP if (free_blocks - root_blocks < FBC_BATCH) free_blocks = - percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 9007ccdfc11..20838883535 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc); + s64 ret = __percpu_counter_sum(fbc, 0); return ret < 0 ?
0 : ret; } +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc, 1); +} + + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc); + return __percpu_counter_sum(fbc, 0); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 119174494cb..4a8ba4bf5f6 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) { s64 ret; int cpu; @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; + if (set) + *pcount = 0; } + if (set) + fbc->count = ret; + spin_unlock(&fbc->lock); return ret; } -- cgit v1.2.3-70-g09d2 From 06d6cf6959d22037fcec598f4f954db5db3d7356 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: mm: Add range_cont mode for writeback Filesystems like ext4 need to start a new transaction in writepages for block allocation. This happens with delayed allocation, and there is a limit to how many credits we can request from the journal layer. So we call write_cache_pages multiple times with wbc->nr_to_write set to the maximum possible value, limited by the max journal credits available. Add a new mode to writeback that enables us to handle this behaviour. In the new mode we update wbc->range_start to point to the new offset to be written. The next call to write_cache_pages will start writeout from the specified range_start offset. In the new mode we also limit writing to the specified wbc->range_end.
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- include/linux/writeback.h | 1 + mm/page-writeback.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f462439cc28..0d8573e6b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,6 +63,7 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + unsigned range_cont:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef3..ded57d52806 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -956,6 +956,9 @@ retry: } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- cgit v1.2.3-70-g09d2 From 006ebb40d3d65338bd74abb03b945f8d60e362bd Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 19 May 2008 08:32:49 -0400 Subject: Security: split proc ptrace checking into read vs. attach Enable security modules to distinguish reading of process state via proc from full ptrace access by renaming ptrace_may_attach to ptrace_may_access and adding a mode argument indicating whether only read access or full attach access is requested. This allows security modules to permit access to reading process state without granting full ptrace access. The base DAC/capability checking remains unchanged. Read access to /proc/pid/mem continues to apply a full ptrace attach check since check_mem_permission() already requires the current task to already be ptracing the target. The other ptrace checks within proc for elements like environ, maps, and fds are changed to pass the read mode instead of attach. 
In the SELinux case, we model such reading of process state as a reading of a proc file labeled with the target process' label. This enables SELinux policy to permit such reading of process state without permitting control or manipulation of the target process, as there are a number of cases where programs probe for such information via proc but do not need to be able to control the target (e.g. procps, lsof, PolicyKit, ConsoleKit). At present we have to choose between allowing full ptrace in policy (more permissive than required/desired) or breaking functionality (or in some cases just silencing the denials via dontaudit rules but this can hide genuine attacks). This version of the patch incorporates comments from Casey Schaufler (change/replace existing ptrace_may_attach interface, pass access mode), and Chris Wright (provide greater consistency in the checking). Note that like their predecessors __ptrace_may_attach and ptrace_may_attach, the __ptrace_may_access and ptrace_may_access interfaces use different return value conventions from each other (0 or -errno vs. 1 or 0). I retained this difference to avoid any changes to the caller logic but made the difference clearer by changing the latter interface to return a bool rather than an int and by adding a comment about it to ptrace.h for any future callers. 
Signed-off-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: James Morris --- fs/proc/base.c | 9 +++++---- fs/proc/task_mmu.c | 6 +++--- fs/proc/task_nommu.c | 2 +- include/linux/ptrace.h | 8 ++++++-- include/linux/security.h | 16 +++++++++++----- kernel/ptrace.c | 15 ++++++++------- security/commoncap.c | 3 ++- security/dummy.c | 3 ++- security/security.c | 5 +++-- security/selinux/hooks.c | 13 +++++++++++-- security/smack/smack_lsm.c | 5 +++-- 11 files changed, 55 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b455371e7f..58c3e6a8e15 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task) */ if (task->parent == current && (task->ptrace & PT_PTRACED) && task_is_stopped_or_traced(task) && - ptrace_may_attach(task)) + ptrace_may_access(task, PTRACE_MODE_ATTACH)) return 0; /* @@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) task_lock(task); if (task->mm != mm) goto out; - if (task->mm != current->mm && __ptrace_may_attach(task) < 0) + if (task->mm != current->mm && + __ptrace_may_access(task, PTRACE_MODE_READ) < 0) goto out; task_unlock(task); return mm; @@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + allowed = ptrace_may_access(task, PTRACE_MODE_READ); put_task_struct(task); } return allowed; @@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; ret = -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c492449f3b4..164bd9f9ede 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_attach(task)) + if 
(maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; if (file) { @@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out; ret = -EACCES; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_task; ret = -EINVAL; @@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return show_numa_map(m, v); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f18..5d84e7121df 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return nommu_vma_show(m, vml->vma); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index f98501ba557..c6f5f9dd0ce 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,8 +95,12 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void ptrace_untrace(struct task_struct *child); -extern int ptrace_may_attach(struct task_struct *task); -extern int __ptrace_may_attach(struct task_struct *task); +#define PTRACE_MODE_READ 1 +#define PTRACE_MODE_ATTACH 2 +/* Returns 0 on success, -errno on denial. */ +extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); +/* Returns true on success, false on denial. 
*/ +extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { diff --git a/include/linux/security.h b/include/linux/security.h index 50737c70e78..62bd80cb7f8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -46,7 +46,8 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, int cap); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace(struct task_struct *parent, struct task_struct *child); +extern int cap_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1170,6 +1171,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * attributes would be changed by the execve. * @parent contains the task_struct structure for parent process. * @child contains the task_struct structure for child process. + * @mode contains the PTRACE_MODE flags indicating the form of access. * Return 0 if permission is granted. 
* @capget: * Get the @effective, @inheritable, and @permitted capability sets for @@ -1295,7 +1297,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace) (struct task_struct *parent, struct task_struct *child); + int (*ptrace) (struct task_struct *parent, struct task_struct *child, + unsigned int mode); int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1573,7 +1576,8 @@ extern struct dentry *securityfs_create_dir(const char *name, struct dentry *par extern void securityfs_remove(struct dentry *dentry); /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child); +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, @@ -1755,9 +1759,11 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace(struct task_struct *parent, struct task_struct *child) +static inline int security_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { - return cap_ptrace(parent, child); + return cap_ptrace(parent, child, mode); } static inline int security_capget(struct task_struct *target, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6c19e94fd0a..e337390fce0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } -int __ptrace_may_attach(struct task_struct *task) +int __ptrace_may_access(struct task_struct *task, unsigned int mode) { /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task); + return security_ptrace(current, task, mode); } -int ptrace_may_attach(struct task_struct *task) +bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); - err = __ptrace_may_attach(task); + err = __ptrace_may_access(task, mode); task_unlock(task); - return !err; + return (!err ? true : false); } int ptrace_attach(struct task_struct *task) @@ -195,7 +195,7 @@ repeat: /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = __ptrace_may_attach(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; @@ -494,7 +494,8 @@ int ptrace_traceme(void) */ task_lock(current); if (!(current->ptrace & PT_PTRACED)) { - ret = security_ptrace(current->parent, current); + ret = security_ptrace(current->parent, current, + PTRACE_MODE_ATTACH); /* * Set the ptrace bit in the process ptrace flags. */ diff --git a/security/commoncap.c b/security/commoncap.c index 33d34330841..0b6537a3672 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -63,7 +63,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz) return 0; } -int cap_ptrace (struct task_struct *parent, struct task_struct *child) +int cap_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. 
*/ if (!cap_issubset(child->cap_permitted, parent->cap_permitted) && diff --git a/security/dummy.c b/security/dummy.c index b8916883b77..1db712d99dc 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -30,7 +30,8 @@ #include #include -static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) +static int dummy_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { return 0; } diff --git a/security/security.c b/security/security.c index 59838a99b80..c4507ce2a5a 100644 --- a/security/security.c +++ b/security/security.c @@ -161,9 +161,10 @@ int mod_reg_security(const char *name, struct security_operations *ops) /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child) +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode) { - return security_ops->ptrace(parent, child); + return security_ops->ptrace(parent, child, mode); } int security_capget(struct task_struct *target, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index eca70f42e67..4be156334b2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1686,14 +1686,23 @@ static inline u32 file_to_av(struct file *file) /* Hook functions begin here. 
*/ -static int selinux_ptrace(struct task_struct *parent, struct task_struct *child) +static int selinux_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { int rc; - rc = secondary_ops->ptrace(parent, child); + rc = secondary_ops->ptrace(parent, child, mode); if (rc) return rc; + if (mode == PTRACE_MODE_READ) { + struct task_security_struct *tsec = parent->security; + struct task_security_struct *csec = child->security; + return avc_has_perm(tsec->sid, csec->sid, + SECCLASS_FILE, FILE__READ, NULL); + } + return task_has_perm(parent, child, PROCESS__PTRACE); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4a09293efa0..3c7150b3493 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -95,11 +95,12 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp) +static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp, + unsigned int mode) { int rc; - rc = cap_ptrace(ptp, ctp); + rc = cap_ptrace(ptp, ctp, mode); if (rc != 0) return rc; -- cgit v1.2.3-70-g09d2 From 2069f457848f846cb31149c9aa29b330a6b66d1b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 4 Jul 2008 09:47:13 +1000 Subject: LSM/SELinux: show LSM mount options in /proc/mounts This patch causes SELinux mount options to show up in /proc/mounts. As with other code in the area, seq_put errors are ignored. Other LSMs will not have their mount options displayed until they fill in their own security_sb_show_options() function. 
Signed-off-by: Eric Paris Signed-off-by: Miklos Szeredi Signed-off-by: James Morris --- fs/namespace.c | 14 +++++++++--- include/linux/security.h | 9 ++++++++ security/dummy.c | 6 ++++++ security/security.c | 5 +++++ security/selinux/hooks.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 85 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e..4f6f7635b59 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -750,7 +750,7 @@ struct proc_fs_info { const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } diff --git a/include/linux/security.h b/include/linux/security.h index 62bd80cb7f8..c8ad8ec684b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -80,6 +80,7 @@ struct xfrm_selector; struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; +struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); @@ -1331,6 +1332,7 @@ struct security_operations { void (*sb_free_security) (struct super_block *sb); int (*sb_copy_data) (char *orig, char *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); + int (*sb_show_options) (struct seq_file *m, struct super_block *sb); int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1610,6 +1612,7 @@ int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); int security_sb_copy_data(char *orig, char *copy); int security_sb_kern_mount(struct super_block *sb, void *data); +int security_sb_show_options(struct seq_file *m, struct super_block *sb); int security_sb_statfs(struct dentry *dentry); int security_sb_mount(char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1887,6 +1890,12 @@ static inline int security_sb_kern_mount(struct super_block *sb, void *data) return 0; } +static inline int security_sb_show_options(struct seq_file *m, + struct super_block *sb) +{ + return 0; +} + static inline int security_sb_statfs(struct dentry *dentry) { return 0; diff --git a/security/dummy.c b/security/dummy.c index 1db712d99dc..c155f08e9dd 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -194,6 +194,11 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data) 
return 0; } +static int dummy_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return 0; +} + static int dummy_sb_statfs (struct dentry *dentry) { return 0; @@ -1088,6 +1093,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, sb_free_security); set_to_dummy_if_null(ops, sb_copy_data); set_to_dummy_if_null(ops, sb_kern_mount); + set_to_dummy_if_null(ops, sb_show_options); set_to_dummy_if_null(ops, sb_statfs); set_to_dummy_if_null(ops, sb_mount); set_to_dummy_if_null(ops, sb_check_sb); diff --git a/security/security.c b/security/security.c index 2c0a5876b93..de74fdccde2 100644 --- a/security/security.c +++ b/security/security.c @@ -292,6 +292,11 @@ int security_sb_kern_mount(struct super_block *sb, void *data) return security_ops->sb_kern_mount(sb, data); } +int security_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return security_ops->sb_show_options(m, sb); +} + int security_sb_statfs(struct dentry *dentry) { return security_ops->sb_statfs(dentry); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 85f74f66576..33dee83fdd2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -9,7 +9,8 @@ * James Morris * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. - * Copyright (C) 2003 Red Hat, Inc., James Morris + * Copyright (C) 2003-2008 Red Hat, Inc., James Morris + * Eric Paris * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. 
@@ -970,6 +971,57 @@ out_err: return rc; } +void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) +{ + int i; + char *prefix; + + for (i = 0; i < opts->num_mnt_opts; i++) { + char *has_comma = strchr(opts->mnt_opts[i], ','); + + switch (opts->mnt_opts_flags[i]) { + case CONTEXT_MNT: + prefix = CONTEXT_STR; + break; + case FSCONTEXT_MNT: + prefix = FSCONTEXT_STR; + break; + case ROOTCONTEXT_MNT: + prefix = ROOTCONTEXT_STR; + break; + case DEFCONTEXT_MNT: + prefix = DEFCONTEXT_STR; + break; + default: + BUG(); + }; + /* we need a comma before each option */ + seq_putc(m, ','); + seq_puts(m, prefix); + if (has_comma) + seq_putc(m, '\"'); + seq_puts(m, opts->mnt_opts[i]); + if (has_comma) + seq_putc(m, '\"'); + } +} + +static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + struct security_mnt_opts opts; + int rc; + + rc = selinux_get_mnt_opts(sb, &opts); + if (rc) + return rc; + + selinux_write_opts(m, &opts); + + security_free_mnt_opts(&opts); + + return rc; +} + static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { @@ -5365,6 +5417,7 @@ static struct security_operations selinux_ops = { .sb_free_security = selinux_sb_free_security, .sb_copy_data = selinux_sb_copy_data, .sb_kern_mount = selinux_sb_kern_mount, + .sb_show_options = selinux_sb_show_options, .sb_statfs = selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount = selinux_umount, -- cgit v1.2.3-70-g09d2 From b478a9f9889c81e88077d1495daadee64c0af541 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Jul 2008 20:56:04 +0200 Subject: security: remove unused sb_get_mnt_opts hook The sb_get_mnt_opts() hook is unused, and is superseded by the sb_show_options() hook. 
Signed-off-by: Miklos Szeredi Acked-by: James Morris --- include/linux/security.h | 14 -------------- security/dummy.c | 8 -------- security/security.c | 6 ------ security/selinux/hooks.c | 1 - 4 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index c8ad8ec684b..43c6357568a 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -291,10 +291,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Update module state after a successful pivot. * @old_path contains the path for the old root. * @new_path contains the path for the new root. - * @sb_get_mnt_opts: - * Get the security relevant mount options used for a superblock - * @sb the superblock to get security mount options from - * @opts binary data structure containing all lsm mount data * @sb_set_mnt_opts: * Set the security relevant mount options used for a superblock * @sb the superblock to set security mount options for @@ -1348,8 +1344,6 @@ struct security_operations { struct path *new_path); void (*sb_post_pivotroot) (struct path *old_path, struct path *new_path); - int (*sb_get_mnt_opts) (const struct super_block *sb, - struct security_mnt_opts *opts); int (*sb_set_mnt_opts) (struct super_block *sb, struct security_mnt_opts *opts); void (*sb_clone_mnt_opts) (const struct super_block *oldsb, @@ -1624,8 +1618,6 @@ void security_sb_post_remount(struct vfsmount *mnt, unsigned long flags, void *d void security_sb_post_addmount(struct vfsmount *mnt, struct path *mountpoint); int security_sb_pivotroot(struct path *old_path, struct path *new_path); void security_sb_post_pivotroot(struct path *old_path, struct path *new_path); -int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts); int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts); void security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb); @@ 
-1942,12 +1934,6 @@ static inline int security_sb_pivotroot(struct path *old_path, static inline void security_sb_post_pivotroot(struct path *old_path, struct path *new_path) { } -static inline int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - security_init_mnt_opts(opts); - return 0; -} static inline int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) diff --git a/security/dummy.c b/security/dummy.c index c155f08e9dd..79385669164 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -252,13 +252,6 @@ static void dummy_sb_post_pivotroot (struct path *old_path, struct path *new_pat return; } -static int dummy_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - security_init_mnt_opts(opts); - return 0; -} - static int dummy_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { @@ -1104,7 +1097,6 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, sb_post_addmount); set_to_dummy_if_null(ops, sb_pivotroot); set_to_dummy_if_null(ops, sb_post_pivotroot); - set_to_dummy_if_null(ops, sb_get_mnt_opts); set_to_dummy_if_null(ops, sb_set_mnt_opts); set_to_dummy_if_null(ops, sb_clone_mnt_opts); set_to_dummy_if_null(ops, sb_parse_opts_str); diff --git a/security/security.c b/security/security.c index de74fdccde2..28b2860c112 100644 --- a/security/security.c +++ b/security/security.c @@ -348,12 +348,6 @@ void security_sb_post_pivotroot(struct path *old_path, struct path *new_path) security_ops->sb_post_pivotroot(old_path, new_path); } -int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - return security_ops->sb_get_mnt_opts(sb, opts); -} - int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 33dee83fdd2..745a69e74e3 100644 --- a/security/selinux/hooks.c +++ 
b/security/selinux/hooks.c @@ -5421,7 +5421,6 @@ static struct security_operations selinux_ops = { .sb_statfs = selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount = selinux_umount, - .sb_get_mnt_opts = selinux_get_mnt_opts, .sb_set_mnt_opts = selinux_set_mnt_opts, .sb_clone_mnt_opts = selinux_sb_clone_mnt_opts, .sb_parse_opts_str = selinux_parse_opts_str, -- cgit v1.2.3-70-g09d2 From 6f0f0fd496333777d53daff21a4e3b28c4d03a6d Mon Sep 17 00:00:00 2001 From: James Morris Date: Thu, 10 Jul 2008 17:02:07 +0900 Subject: security: remove register_security hook The register security hook is no longer required, as the capability module is always registered. LSMs wishing to stack capability as a secondary module should do so explicitly. Signed-off-by: James Morris Acked-by: Stephen Smalley Acked-by: Greg Kroah-Hartman --- include/linux/security.h | 10 ---------- security/capability.c | 7 ------- security/root_plug.c | 9 --------- security/security.c | 29 ----------------------------- security/selinux/hooks.c | 32 +++++--------------------------- security/smack/smack_lsm.c | 23 ----------------------- 6 files changed, 5 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 43c6357568a..31c8851ec5d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1239,11 +1239,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @pages contains the number of pages. * Return 0 if permission is granted. * - * @register_security: - * allow module stacking. - * @name contains the name of the security module being stacked. - * @ops contains a pointer to the struct security_operations of the module to stack. - * * @secid_to_secctx: * Convert secid to security context. * @secid contains the security ID. 
@@ -1471,10 +1466,6 @@ struct security_operations { int (*netlink_send) (struct sock *sk, struct sk_buff *skb); int (*netlink_recv) (struct sk_buff *skb, int cap); - /* allow module stacking */ - int (*register_security) (const char *name, - struct security_operations *ops); - void (*d_instantiate) (struct dentry *dentry, struct inode *inode); int (*getprocattr) (struct task_struct *p, char *name, char **value); @@ -1564,7 +1555,6 @@ struct security_operations { extern int security_init(void); extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); -extern int mod_reg_security(const char *name, struct security_operations *ops); extern struct dentry *securityfs_create_file(const char *name, mode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); diff --git a/security/capability.c b/security/capability.c index 6e0671c8201..5b01c0b0242 100644 --- a/security/capability.c +++ b/security/capability.c @@ -721,12 +721,6 @@ static int cap_xfrm_decode_session(struct sk_buff *skb, u32 *fl, int ckall) } #endif /* CONFIG_SECURITY_NETWORK_XFRM */ -static int cap_register_security(const char *name, - struct security_operations *ops) -{ - return -EINVAL; -} - static void cap_d_instantiate(struct dentry *dentry, struct inode *inode) { } @@ -940,7 +934,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, sem_semop); set_to_cap_if_null(ops, netlink_send); set_to_cap_if_null(ops, netlink_recv); - set_to_cap_if_null(ops, register_security); set_to_cap_if_null(ops, d_instantiate); set_to_cap_if_null(ops, getprocattr); set_to_cap_if_null(ops, setprocattr); diff --git a/security/root_plug.c b/security/root_plug.c index a41cf42a4fa..be0ebec2580 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -28,9 +28,6 @@ #include #include -/* flag to keep track of how we were registered */ -static int secondary; - /* default is a generic type of usb to 
serial converter */ static int vendor_id = 0x0557; static int product_id = 0x2008; @@ -97,13 +94,7 @@ static int __init rootplug_init (void) if (register_security (&rootplug_security_ops)) { printk (KERN_INFO "Failure registering Root Plug module with the kernel\n"); - /* try registering with primary module */ - if (mod_reg_security (MY_NAME, &rootplug_security_ops)) { - printk (KERN_INFO "Failure registering Root Plug " - " module with primary security module.\n"); return -EINVAL; - } - secondary = 1; } printk (KERN_INFO "Root Plug module initialized, " "vendor_id = %4.4x, product id = %4.4x\n", vendor_id, product_id); diff --git a/security/security.c b/security/security.c index 30b0278de39..59f23b5918b 100644 --- a/security/security.c +++ b/security/security.c @@ -125,35 +125,6 @@ int register_security(struct security_operations *ops) return 0; } -/** - * mod_reg_security - allows security modules to be "stacked" - * @name: a pointer to a string with the name of the security_options to be registered - * @ops: a pointer to the struct security_options that is to be registered - * - * This function allows security modules to be stacked if the currently loaded - * security module allows this to happen. It passes the @name and @ops to the - * register_security function of the currently loaded security module. - * - * The return value depends on the currently loaded security module, with 0 as - * success. 
- */ -int mod_reg_security(const char *name, struct security_operations *ops) -{ - if (verify(ops)) { - printk(KERN_INFO "%s could not verify " - "security operations.\n", __func__); - return -EINVAL; - } - - if (ops == security_ops) { - printk(KERN_INFO "%s security operations " - "already registered.\n", __func__); - return -EINVAL; - } - - return security_ops->register_security(name, ops); -} - /* Security operations */ int security_ptrace(struct task_struct *parent, struct task_struct *child, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 745a69e74e3..91200feb3f9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -126,13 +126,11 @@ __setup("selinux=", selinux_enabled_setup); int selinux_enabled = 1; #endif -/* Original (dummy) security module. */ -static struct security_operations *original_ops; -/* Minimal support for a secondary security module, - just to allow the use of the dummy or capability modules. - The owlsm module can alternatively be used as a secondary - module as long as CONFIG_OWLSM_FD is not enabled. */ +/* + * Minimal support for a secondary security module, + * just to allow the use of the capability module. 
+ */ static struct security_operations *secondary_ops; /* Lists of inode and superblock security structures initialized @@ -5115,24 +5113,6 @@ static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) *secid = isec->sid; } -/* module stacking operations */ -static int selinux_register_security(const char *name, struct security_operations *ops) -{ - if (secondary_ops != original_ops) { - printk(KERN_ERR "%s: There is already a secondary security " - "module registered.\n", __func__); - return -EINVAL; - } - - secondary_ops = ops; - - printk(KERN_INFO "%s: Registering secondary module %s\n", - __func__, - name); - - return 0; -} - static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) { if (inode) @@ -5517,8 +5497,6 @@ static struct security_operations selinux_ops = { .sem_semctl = selinux_sem_semctl, .sem_semop = selinux_sem_semop, - .register_security = selinux_register_security, - .d_instantiate = selinux_d_instantiate, .getprocattr = selinux_getprocattr, @@ -5612,7 +5590,7 @@ static __init int selinux_init(void) 0, SLAB_PANIC, NULL); avc_init(); - original_ops = secondary_ops = security_ops; + secondary_ops = security_ops; if (!secondary_ops) panic("SELinux: No initial security operations\n"); if (register_security(&selinux_ops)) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 3c7150b3493..ee5a51cbc5e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1822,27 +1822,6 @@ static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid) *secid = smack_to_secid(smack); } -/* module stacking operations */ - -/** - * smack_register_security - stack capability module - * @name: module name - * @ops: module operations - ignored - * - * Allow the capability module to register. 
- */ -static int smack_register_security(const char *name, - struct security_operations *ops) -{ - if (strcmp(name, "capability") != 0) - return -EINVAL; - - printk(KERN_INFO "%s: Registering secondary module %s\n", - __func__, name); - - return 0; -} - /** * smack_d_instantiate - Make sure the blob is correct on an inode * @opt_dentry: unused @@ -2673,8 +2652,6 @@ struct security_operations smack_ops = { .netlink_send = cap_netlink_send, .netlink_recv = cap_netlink_recv, - .register_security = smack_register_security, - .d_instantiate = smack_d_instantiate, .getprocattr = smack_getprocattr, -- cgit v1.2.3-70-g09d2 From 7e9db9eaefdb8798730790214ff1b7746006ec98 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 14 Jul 2008 09:58:44 +0200 Subject: [S390] cio: Introduce modalias for css bus. Add modalias and subchannel type attributes for all subchannels. I/O subchannel specific attributes are now created in io_subchannel_probe(). modalias and subchannel type are also added to the uevent for the css bus. Also make the css modalias known. 
Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- Documentation/ABI/testing/sysfs-bus-css | 35 +++++++++++++++++ drivers/s390/cio/cio.h | 1 + drivers/s390/cio/css.c | 69 ++++++++++++++++++++++++++++++--- drivers/s390/cio/css.h | 2 - drivers/s390/cio/device.c | 47 ++++++++++++++-------- include/linux/mod_devicetable.h | 9 +++++ scripts/mod/file2alias.c | 12 ++++++ 7 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-css (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css new file mode 100644 index 00000000000..b585ec258a0 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-css @@ -0,0 +1,35 @@ +What: /sys/bus/css/devices/.../type +Date: March 2008 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the subchannel type, as reported by the hardware. + This attribute is present for all subchannel types. + +What: /sys/bus/css/devices/.../modalias +Date: March 2008 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the module alias as reported with uevents. + It is of the format css:t<type> and present for all + subchannel types. + +What: /sys/bus/css/drivers/io_subchannel/.../chpids +Date: December 2002 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the ids of the channel paths used by this + subchannel, as reported by the channel subsystem + during subchannel recognition. + Note: This is an I/O-subchannel specific attribute. +Users: s390-tools, HAL + +What: /sys/bus/css/drivers/io_subchannel/.../pimpampom +Date: December 2002 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the PIM/PAM/POM values, as reported by the + channel subsystem when last queried by the common I/O + layer (this implies that this attribute is not necessarily + in sync with the values current in the channel subsystem). 
+ Note: This is an I/O-subchannel specific attribute. +Users: s390-tools, HAL diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 6e933aebe01..4062748e834 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -3,6 +3,7 @@ #include #include +#include #include #include "chsc.h" #include "schid.h" diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index b7f4b52c5a9..53e7496dc90 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -2,8 +2,7 @@ * drivers/s390/cio/css.c * driver for channel subsystem * - * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH, - * IBM Corporation + * Copyright IBM Corp. 2002,2008 * Author(s): Arnd Bergmann (arndb@de.ibm.com) * Cornelia Huck (cornelia.huck@de.ibm.com) */ @@ -210,6 +209,41 @@ void css_update_ssd_info(struct subchannel *sch) } } +static ssize_t type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + + return sprintf(buf, "%01x\n", sch->st); +} + +static DEVICE_ATTR(type, 0444, type_show, NULL); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + + return sprintf(buf, "css:t%01X\n", sch->st); +} + +static DEVICE_ATTR(modalias, 0444, modalias_show, NULL); + +static struct attribute *subch_attrs[] = { + &dev_attr_type.attr, + &dev_attr_modalias.attr, + NULL, +}; + +static struct attribute_group subch_attr_group = { + .attrs = subch_attrs, +}; + +static struct attribute_group *default_subch_attr_groups[] = { + &subch_attr_group, + NULL, +}; + static int css_register_subchannel(struct subchannel *sch) { int ret; @@ -218,16 +252,17 @@ static int css_register_subchannel(struct subchannel *sch) sch->dev.parent = &channel_subsystems[0]->device; sch->dev.bus = &css_bus_type; sch->dev.release = &css_subchannel_release; - sch->dev.groups = subch_attr_groups; + sch->dev.groups = default_subch_attr_groups; /* * We don't 
want to generate uevents for I/O subchannels that don't * have a working ccw device behind them since they will be * unregistered before they can be used anyway, so we delay the add * uevent until after device recognition was successful. + * Note that we suppress the uevent for all subchannel types; + * the subchannel driver can decide itself when it wants to inform + * userspace of its existence. */ - if (!cio_is_console(sch->schid)) - /* Console is special, no need to suppress. */ - sch->dev.uevent_suppress = 1; + sch->dev.uevent_suppress = 1; css_update_ssd_info(sch); /* make it known to the system */ ret = css_sch_device_register(sch); @@ -236,6 +271,15 @@ static int css_register_subchannel(struct subchannel *sch) sch->schid.ssid, sch->schid.sch_no, ret); return ret; } + if (!sch->driver) { + /* + * No driver matched. Generate the uevent now so that + * a fitting driver module may be loaded based on the + * modalias. + */ + sch->dev.uevent_suppress = 0; + kobject_uevent(&sch->dev.kobj, KOBJ_ADD); + } return ret; } @@ -926,12 +970,25 @@ static void css_shutdown(struct device *dev) sch->driver->shutdown(sch); } +static int css_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct subchannel *sch = to_subchannel(dev); + int ret; + + ret = add_uevent_var(env, "ST=%01X", sch->st); + if (ret) + return ret; + ret = add_uevent_var(env, "MODALIAS=css:t%01X", sch->st); + return ret; +} + struct bus_type css_bus_type = { .name = "css", .match = css_bus_match, .probe = css_probe, .remove = css_remove, .shutdown = css_shutdown, + .uevent = css_uevent, }; /** diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index bfe0ada43f2..e0fc7b49978 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -143,6 +143,4 @@ int css_sch_is_valid(struct schib *); extern struct workqueue_struct *slow_path_wq; void css_wait_for_slow_path(void); - -extern struct attribute_group *subch_attr_groups[]; #endif diff --git a/drivers/s390/cio/device.c 
b/drivers/s390/cio/device.c index 0ed5a81260b..23b129fd4d8 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -585,19 +585,14 @@ static DEVICE_ATTR(modalias, 0444, modalias_show, NULL); static DEVICE_ATTR(online, 0644, online_show, online_store); static DEVICE_ATTR(availability, 0444, available_show, NULL); -static struct attribute * subch_attrs[] = { +static struct attribute *io_subchannel_attrs[] = { &dev_attr_chpids.attr, &dev_attr_pimpampom.attr, NULL, }; -static struct attribute_group subch_attr_group = { - .attrs = subch_attrs, -}; - -struct attribute_group *subch_attr_groups[] = { - &subch_attr_group, - NULL, +static struct attribute_group io_subchannel_attr_group = { + .attrs = io_subchannel_attrs, }; static struct attribute * ccwdev_attrs[] = { @@ -1157,11 +1152,21 @@ static int io_subchannel_probe(struct subchannel *sch) cdev = sch_get_cdev(sch); if (cdev) { + rc = sysfs_create_group(&sch->dev.kobj, + &io_subchannel_attr_group); + if (rc) + CIO_MSG_EVENT(0, "Failed to create io subchannel " + "attributes for subchannel " + "0.%x.%04x (rc=%d)\n", + sch->schid.ssid, sch->schid.sch_no, rc); /* * This subchannel already has an associated ccw_device. - * Register it and exit. This happens for all early - * device, e.g. the console. + * Throw the delayed uevent for the subchannel, register + * the ccw_device and exit. This happens for all early + * devices, e.g. the console. */ + sch->dev.uevent_suppress = 0; + kobject_uevent(&sch->dev.kobj, KOBJ_ADD); cdev->dev.groups = ccwdev_attr_groups; device_initialize(&cdev->dev); ccw_device_register(cdev); @@ -1184,11 +1189,17 @@ static int io_subchannel_probe(struct subchannel *sch) */ dev_id.devno = sch->schib.pmcw.dev; dev_id.ssid = sch->schid.ssid; + rc = sysfs_create_group(&sch->dev.kobj, + &io_subchannel_attr_group); + if (rc) + return rc; /* Allocate I/O subchannel private data. 
*/ sch->private = kzalloc(sizeof(struct io_subchannel_private), GFP_KERNEL | GFP_DMA); - if (!sch->private) - return -ENOMEM; + if (!sch->private) { + rc = -ENOMEM; + goto out_err; + } cdev = get_disc_ccwdev_by_dev_id(&dev_id, NULL); if (!cdev) cdev = get_orphaned_ccwdev_by_dev_id(to_css(sch->dev.parent), @@ -1207,8 +1218,8 @@ static int io_subchannel_probe(struct subchannel *sch) } cdev = io_subchannel_create_ccwdev(sch); if (IS_ERR(cdev)) { - kfree(sch->private); - return PTR_ERR(cdev); + rc = PTR_ERR(cdev); + goto out_err; } rc = io_subchannel_recog(cdev, sch); if (rc) { @@ -1217,9 +1228,12 @@ static int io_subchannel_probe(struct subchannel *sch) spin_unlock_irqrestore(sch->lock, flags); if (cdev->dev.release) cdev->dev.release(&cdev->dev); - kfree(sch->private); + goto out_err; } - + return 0; +out_err: + kfree(sch->private); + sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return rc; } @@ -1240,6 +1254,7 @@ io_subchannel_remove (struct subchannel *sch) ccw_device_unregister(cdev); put_device(&cdev->dev); kfree(sch->private); + sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return 0; } diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 69b2342d5eb..1fd03e732e0 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -159,6 +159,15 @@ struct ap_device_id { #define AP_DEVICE_ID_MATCH_DEVICE_TYPE 0x01 +/* s390 css bus devices (subchannels) */ +struct css_device_id { + __u8 type; /* subchannel type */ + __u8 pad1; + __u16 pad2; + __u32 pad3; + kernel_ulong_t driver_data; +}; + #define ACPI_ID_LEN 16 /* only 9 bytes needed here, 16 bytes are used */ /* to workaround crosscompile issues */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index cea4a790e1e..37d5c363fbc 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -304,6 +304,14 @@ static int do_ap_entry(const char *filename, return 1; } +/* looks like: "css:tN" */ +static int 
do_css_entry(const char *filename, + struct css_device_id *id, char *alias) +{ + sprintf(alias, "css:t%01X", id->type); + return 1; +} + /* Looks like: "serio:tyNprNidNexN" */ static int do_serio_entry(const char *filename, struct serio_device_id *id, char *alias) @@ -680,6 +688,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, do_table(symval, sym->st_size, sizeof(struct ap_device_id), "ap", do_ap_entry, mod); + else if (sym_is(symname, "__mod_css_device_table")) + do_table(symval, sym->st_size, + sizeof(struct css_device_id), "css", + do_css_entry, mod); else if (sym_is(symname, "__mod_serio_device_table")) do_table(symval, sym->st_size, sizeof(struct serio_device_id), "serio", -- cgit v1.2.3-70-g09d2 From f08adc008d84f6b03d377ede951e29ed169e76e2 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 14 Jul 2008 09:59:03 +0200 Subject: [S390] css: Use css_device_id for bus matching. css_device_id exists, so use it for determining the right driver (and add a match_flags which is always 1 for valid types). Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- drivers/s390/cio/css.c | 15 ++++++--------- drivers/s390/cio/css.h | 2 +- drivers/s390/cio/device.c | 8 +++++++- include/linux/mod_devicetable.h | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 45ba07c0a28..4e2f2bbf4ba 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -850,19 +850,16 @@ int sch_is_pseudo_sch(struct subchannel *sch) return sch == to_css(sch->dev.parent)->pseudo_subchannel; } -/* - * find a driver for a subchannel. 
They identify by the subchannel - * type with the exception that the console subchannel driver has its own - * subchannel type although the device is an i/o subchannel - */ -static int -css_bus_match (struct device *dev, struct device_driver *drv) +static int css_bus_match(struct device *dev, struct device_driver *drv) { struct subchannel *sch = to_subchannel(dev); struct css_driver *driver = to_cssdriver(drv); + struct css_device_id *id; - if (sch->st == driver->subchannel_type) - return 1; + for (id = driver->subchannel_type; id->match_flags; id++) { + if (sch->st == id->type) + return 1; + } return 0; } diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index 38bf9ddb841..58020bf41ed 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -75,7 +75,7 @@ struct chp_link; */ struct css_driver { struct module *owner; - unsigned int subchannel_type; + struct css_device_id *subchannel_type; struct device_driver drv; void (*irq)(struct subchannel *); int (*chp_event)(struct subchannel *, struct chp_link *, int); diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 522d47afc95..c904cb84d75 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -131,9 +131,15 @@ static int io_subchannel_sch_event(struct subchannel *, int); static int io_subchannel_chp_event(struct subchannel *, struct chp_link *, int); +static struct css_device_id io_subchannel_ids[] = { + { .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, }, + { /* end of list */ }, +}; +MODULE_DEVICE_TABLE(css, io_subchannel_ids); + static struct css_driver io_subchannel_driver = { .owner = THIS_MODULE, - .subchannel_type = SUBCHANNEL_TYPE_IO, + .subchannel_type = io_subchannel_ids, .name = "io_subchannel", .irq = io_subchannel_irq, .sch_event = io_subchannel_sch_event, diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 1fd03e732e0..c4db5827963 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h 
@@ -161,8 +161,8 @@ struct ap_device_id { /* s390 css bus devices (subchannels) */ struct css_device_id { + __u8 match_flags; __u8 type; /* subchannel type */ - __u8 pad1; __u16 pad2; __u32 pad3; kernel_ulong_t driver_data; -- cgit v1.2.3-70-g09d2 From 341c2c958ec7bdd9f54733a8b0b432fe76842a82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:51 +0900 Subject: libata: consistently use msecs for time durations libata has been using a mix of jiffies and msecs for time durations. This is getting confusing. As writing sub HZ values in jiffies is PITA and msecs_to_jiffies() can't be used as initializer, unify unit for all time durations to msecs. So, durations are in msecs and deadlines are in jiffies. ata_deadline() is added to compute deadline from a start time and duration in msecs. While at it, drop now superfluous _msec suffix from arguments and rename @timeout to @deadline if it represents a fixed point in time rather than duration. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 44 +++++++++++++++++++++----------------------- drivers/ata/libata-eh.c | 33 +++++++++++++++++---------------- drivers/ata/libata-pmp.c | 3 ++- drivers/ata/libata-sff.c | 15 ++++++++------- drivers/ata/pata_bf54x.c | 6 +++--- drivers/ata/pata_scc.c | 2 +- include/linux/libata.h | 26 ++++++++++++++++---------- 7 files changed, 68 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 303fc0d2b97..c5c3b1b516e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -145,7 +144,7 @@ static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CF module_param_named(dma, libata_dma_mask, int, 0444); MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)"); -static int 
ata_probe_timeout = ATA_TMOUT_INTERNAL / 1000; module_param(ata_probe_timeout, int, 0444); MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)"); @@ -1533,7 +1532,7 @@ unsigned long ata_id_xfermask(const u16 *id) * @ap: The ata_port to queue port_task for * @fn: workqueue function to be scheduled * @data: data for @fn to use - * @delay: delay time for workqueue function + * @delay: delay time in msecs for workqueue function * * Schedule @fn(@data) for execution after @delay jiffies using * port_task. There is one port_task per port and it's the @@ -1552,7 +1551,7 @@ void ata_pio_queue_task(struct ata_port *ap, void *data, unsigned long delay) ap->port_task_data = data; /* may fail if ata_port_flush_task() in progress */ - queue_delayed_work(ata_wq, &ap->port_task, delay); + queue_delayed_work(ata_wq, &ap->port_task, msecs_to_jiffies(delay)); } /** @@ -1685,7 +1684,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); if (!timeout) - timeout = ata_probe_timeout * 1000 / HZ; + timeout = ata_probe_timeout * 1000; rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout)); @@ -3319,7 +3318,7 @@ int ata_wait_ready(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { unsigned long start = jiffies; - unsigned long nodev_deadline = start + ATA_TMOUT_FF_WAIT; + unsigned long nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT); int warned = 0; if (time_after(nodev_deadline, deadline)) @@ -3387,7 +3386,7 @@ int ata_wait_ready(struct ata_link *link, unsigned long deadline, int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { - msleep(ATA_WAIT_AFTER_RESET_MSECS); + msleep(ATA_WAIT_AFTER_RESET); return ata_wait_ready(link, deadline, check_ready); } @@ -3417,13 +3416,13 @@ int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int sata_link_debounce(struct ata_link *link, const unsigned 
long *params, unsigned long deadline) { - unsigned long interval_msec = params[0]; - unsigned long duration = msecs_to_jiffies(params[1]); + unsigned long interval = params[0]; + unsigned long duration = params[1]; unsigned long last_jiffies, t; u32 last, cur; int rc; - t = jiffies + msecs_to_jiffies(params[2]); + t = ata_deadline(jiffies, params[2]); if (time_before(t, deadline)) deadline = t; @@ -3435,7 +3434,7 @@ int sata_link_debounce(struct ata_link *link, const unsigned long *params, last_jiffies = jiffies; while (1) { - msleep(interval_msec); + msleep(interval); if ((rc = sata_scr_read(link, SCR_STATUS, &cur))) return rc; cur &= 0xf; @@ -3444,7 +3443,8 @@ int sata_link_debounce(struct ata_link *link, const unsigned long *params, if (cur == last) { if (cur == 1 && time_before(jiffies, deadline)) continue; - if (time_after(jiffies, last_jiffies + duration)) + if (time_after(jiffies, + ata_deadline(last_jiffies, duration))) return 0; continue; } @@ -3636,7 +3636,8 @@ int sata_link_hardreset(struct ata_link *link, const unsigned long *timing, if (check_ready) { unsigned long pmp_deadline; - pmp_deadline = jiffies + ATA_TMOUT_PMP_SRST_WAIT; + pmp_deadline = ata_deadline(jiffies, + ATA_TMOUT_PMP_SRST_WAIT); if (time_after(pmp_deadline, deadline)) pmp_deadline = deadline; ata_wait_ready(link, pmp_deadline, check_ready); @@ -6073,8 +6074,6 @@ static void __init ata_parse_force_param(void) static int __init ata_init(void) { - ata_probe_timeout *= HZ; - ata_parse_force_param(); ata_wq = create_workqueue("ata"); @@ -6127,8 +6126,8 @@ int ata_ratelimit(void) * @reg: IO-mapped register * @mask: Mask to apply to read register value * @val: Wait condition - * @interval_msec: polling interval in milliseconds - * @timeout_msec: timeout in milliseconds + * @interval: polling interval in milliseconds + * @timeout: timeout in milliseconds * * Waiting for some bits of register to change is a common * operation for ATA controllers. 
This function reads 32bit LE @@ -6146,10 +6145,9 @@ int ata_ratelimit(void) * The final register value. */ u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, - unsigned long interval_msec, - unsigned long timeout_msec) + unsigned long interval, unsigned long timeout) { - unsigned long timeout; + unsigned long deadline; u32 tmp; tmp = ioread32(reg); @@ -6158,10 +6156,10 @@ u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, * preceding writes reach the controller before starting to * eat away the timeout. */ - timeout = jiffies + (timeout_msec * HZ) / 1000; + deadline = ata_deadline(jiffies, timeout); - while ((tmp & mask) == val && time_before(jiffies, timeout)) { - msleep(interval_msec); + while ((tmp & mask) == val && time_before(jiffies, deadline)) { + msleep(interval); tmp = ioread32(reg); } diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 7894d83ea1e..08dd07f1000 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -66,15 +66,14 @@ enum { ATA_ECAT_DUBIOUS_TOUT_HSM = 6, ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, -}; -/* Waiting in ->prereset can never be reliable. It's sometimes nice - * to wait there but it can't be depended upon; otherwise, we wouldn't - * be resetting. Just give it enough time for most drives to spin up. - */ -enum { - ATA_EH_PRERESET_TIMEOUT = 10 * HZ, - ATA_EH_FASTDRAIN_INTERVAL = 3 * HZ, + /* Waiting in ->prereset can never be reliable. It's + * sometimes nice to wait there but it can't be depended upon; + * otherwise, we wouldn't be resetting. Just give it enough + * time for most drives to spin up. + */ + ATA_EH_PRERESET_TIMEOUT = 10000, + ATA_EH_FASTDRAIN_INTERVAL = 3000, }; /* The following table determines how we sequence resets. Each entry @@ -84,10 +83,10 @@ enum { * are mostly for error handling, hotplug and retarded devices. 
*/ static const unsigned long ata_eh_reset_timeouts[] = { - 10 * HZ, /* most drives spin up by 10sec */ - 10 * HZ, /* > 99% working drives spin up before 20sec */ - 35 * HZ, /* give > 30 secs of idleness for retarded devices */ - 5 * HZ, /* and sweet one last chance */ + 10000, /* most drives spin up by 10sec */ + 10000, /* > 99% working drives spin up before 20sec */ + 35000, /* give > 30 secs of idleness for retarded devices */ + 5000, /* and sweet one last chance */ /* > 1 min has elapsed, give up */ }; @@ -641,7 +640,7 @@ void ata_eh_fastdrain_timerfn(unsigned long arg) /* some qcs have finished, give it another chance */ ap->fastdrain_cnt = cnt; ap->fastdrain_timer.expires = - jiffies + ATA_EH_FASTDRAIN_INTERVAL; + ata_deadline(jiffies, ATA_EH_FASTDRAIN_INTERVAL); add_timer(&ap->fastdrain_timer); } @@ -681,7 +680,8 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain) /* activate fast drain */ ap->fastdrain_cnt = cnt; - ap->fastdrain_timer.expires = jiffies + ATA_EH_FASTDRAIN_INTERVAL; + ap->fastdrain_timer.expires = + ata_deadline(jiffies, ATA_EH_FASTDRAIN_INTERVAL); add_timer(&ap->fastdrain_timer); } @@ -2125,7 +2125,8 @@ int ata_eh_reset(struct ata_link *link, int classify, } if (prereset) { - rc = prereset(link, jiffies + ATA_EH_PRERESET_TIMEOUT); + rc = prereset(link, + ata_deadline(jiffies, ATA_EH_PRERESET_TIMEOUT)); if (rc) { if (rc == -ENOENT) { ata_link_printk(link, KERN_DEBUG, @@ -2160,7 +2161,7 @@ int ata_eh_reset(struct ata_link *link, int classify, if (ata_is_host_link(link)) ata_eh_freeze_port(ap); - deadline = jiffies + ata_eh_reset_timeouts[try++]; + deadline = ata_deadline(jiffies, ata_eh_reset_timeouts[try++]); if (reset) { if (verbose) diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 7daf4c0f621..63691d77ac4 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -785,7 +785,8 @@ static int sata_pmp_eh_handle_disabled_links(struct ata_port *ap) * SError.N working. 
*/ sata_link_hardreset(link, sata_deb_timing_normal, - jiffies + ATA_TMOUT_INTERNAL_QUICK, NULL, NULL); + ata_deadline(jiffies, ATA_TMOUT_INTERNAL_QUICK), + NULL, NULL); /* unconditionally clear SError.N */ rc = sata_scr_write(link, SCR_ERROR, SERR_PHYRDY_CHG); diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index c0908c22548..304fdc6f1dc 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -345,8 +345,8 @@ void ata_sff_dma_pause(struct ata_port *ap) /** * ata_sff_busy_sleep - sleep until BSY clears, or timeout * @ap: port containing status register to be polled - * @tmout_pat: impatience timeout - * @tmout: overall timeout + * @tmout_pat: impatience timeout in msecs + * @tmout: overall timeout in msecs * * Sleep until ATA Status register bit BSY clears, * or a timeout occurs. @@ -365,7 +365,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, status = ata_sff_busy_wait(ap, ATA_BUSY, 300); timer_start = jiffies; - timeout = timer_start + tmout_pat; + timeout = ata_deadline(timer_start, tmout_pat); while (status != 0xff && (status & ATA_BUSY) && time_before(jiffies, timeout)) { msleep(50); @@ -377,7 +377,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, "port is slow to respond, please be patient " "(Status 0x%x)\n", status); - timeout = timer_start + tmout; + timeout = ata_deadline(timer_start, tmout); while (status != 0xff && (status & ATA_BUSY) && time_before(jiffies, timeout)) { msleep(50); @@ -390,7 +390,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, if (status & ATA_BUSY) { ata_port_printk(ap, KERN_ERR, "port failed to respond " "(%lu secs, Status 0x%x)\n", - tmout / HZ, status); + DIV_ROUND_UP(tmout, 1000), status); return -EBUSY; } @@ -1888,7 +1888,7 @@ int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask, unsigned int dev1 = devmask & (1 << 1); int rc, ret = 0; - msleep(ATA_WAIT_AFTER_RESET_MSECS); + msleep(ATA_WAIT_AFTER_RESET); /* always check readiness of the master device */ rc = 
ata_sff_wait_ready(link, deadline); @@ -2371,7 +2371,8 @@ void ata_bus_reset(struct ata_port *ap) /* issue bus reset */ if (ap->flags & ATA_FLAG_SRST) { - rc = ata_bus_softreset(ap, devmask, jiffies + 40 * HZ); + rc = ata_bus_softreset(ap, devmask, + ata_deadline(jiffies, 40000)); if (rc && rc != -ENODEV) goto err_out; } diff --git a/drivers/ata/pata_bf54x.c b/drivers/ata/pata_bf54x.c index 55516103626..d3932901a3b 100644 --- a/drivers/ata/pata_bf54x.c +++ b/drivers/ata/pata_bf54x.c @@ -1011,7 +1011,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr; unsigned int dev0 = devmask & (1 << 0); unsigned int dev1 = devmask & (1 << 1); - unsigned long timeout; + unsigned long deadline; /* if device 0 was found in ata_devchk, wait for its * BSY bit to clear @@ -1022,7 +1022,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) /* if device 1 was found in ata_devchk, wait for * register access, then wait for BSY to clear */ - timeout = jiffies + ATA_TMOUT_BOOT; + deadline = ata_deadline(jiffies, ATA_TMOUT_BOOT); while (dev1) { u8 nsect, lbal; @@ -1031,7 +1031,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) lbal = read_atapi_register(base, ATA_REG_LBAL); if ((nsect == 1) && (lbal == 1)) break; - if (time_after(jiffies, timeout)) { + if (time_after(jiffies, deadline)) { dev1 = 0; break; } diff --git a/drivers/ata/pata_scc.c b/drivers/ata/pata_scc.c index bbf5aa345e6..16673d16857 100644 --- a/drivers/ata/pata_scc.c +++ b/drivers/ata/pata_scc.c @@ -696,7 +696,7 @@ static void scc_bmdma_stop (struct ata_queued_cmd *qc) if (reg & INTSTS_BMSINT) { unsigned int classes; - unsigned long deadline = jiffies + ATA_TMOUT_BOOT; + unsigned long deadline = ata_deadline(jiffies, ATA_TMOUT_BOOT); printk(KERN_WARNING "%s: Internal Bus Error\n", DRV_NAME); out_be32(bmid_base + SCC_DMA_INTST, INTSTS_BMSINT); /* TBD: SW reset */ diff --git 
a/include/linux/libata.h b/include/linux/libata.h index e57e5d08312..94110b652b3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -27,6 +27,7 @@ #define __LINUX_LIBATA_H__ #include +#include #include #include #include @@ -115,7 +116,7 @@ enum { /* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */ ATA_MAX_QUEUE = 32, ATA_TAG_INTERNAL = ATA_MAX_QUEUE - 1, - ATA_SHORT_PAUSE = (HZ >> 6) + 1, + ATA_SHORT_PAUSE = 16, ATAPI_MAX_DRAIN = 16 << 10, @@ -234,17 +235,17 @@ enum { /* bits 24:31 of host->flags are reserved for LLD specific flags */ /* various lengths of time */ - ATA_TMOUT_BOOT = 30 * HZ, /* heuristic */ - ATA_TMOUT_BOOT_QUICK = 7 * HZ, /* heuristic */ - ATA_TMOUT_INTERNAL = 30 * HZ, - ATA_TMOUT_INTERNAL_QUICK = 5 * HZ, + ATA_TMOUT_BOOT = 30000, /* heuristic */ + ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ + ATA_TMOUT_INTERNAL = 30000, + ATA_TMOUT_INTERNAL_QUICK = 5000, /* FIXME: GoVault needs 2s but we can't afford that without * parallel probing. 800ms is enough for iVDR disk * HHD424020F7SV00. Increase to 2secs when parallel probing * is in place. */ - ATA_TMOUT_FF_WAIT = 4 * HZ / 5, + ATA_TMOUT_FF_WAIT = 800, /* Spec mandates to wait for ">= 2ms" before checking status * after reset. We wait 150ms, because that was the magic @@ -256,14 +257,14 @@ enum { * * Old drivers/ide uses the 2mS rule and then waits for ready. */ - ATA_WAIT_AFTER_RESET_MSECS = 150, + ATA_WAIT_AFTER_RESET = 150, /* If PMP is supported, we have to do follow-up SRST. As some * PMPs don't send D2H Reg FIS after hardreset, LLDs are * advised to wait only for the following duration before * doing SRST. 
*/ - ATA_TMOUT_PMP_SRST_WAIT = 1 * HZ, + ATA_TMOUT_PMP_SRST_WAIT = 1000, /* ATA bus states */ BUS_UNKNOWN = 0, @@ -895,8 +896,7 @@ extern void ata_host_resume(struct ata_host *host); #endif extern int ata_ratelimit(void); extern u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, - unsigned long interval_msec, - unsigned long timeout_msec); + unsigned long interval, unsigned long timeout); extern int atapi_cmd_type(u8 opcode); extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis); @@ -1389,6 +1389,12 @@ static inline int ata_check_ready(u8 status) return 0; } +static inline unsigned long ata_deadline(unsigned long from_jiffies, + unsigned long timeout_msecs) +{ + return from_jiffies + msecs_to_jiffies(timeout_msecs); +} + /************************************************************************** * PMP - drivers/ata/libata-pmp.c -- cgit v1.2.3-70-g09d2 From 0a2c0f56159999e20015241d3b8fa89b1ab14309 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:52 +0900 Subject: libata: improve EH retry delay handling EH retries were delayed by 5 seconds to ensure that resets don't occur back-to-back. However, this 5 second delay is superfluous or excessive in many cases. For example, after IDENTIFY times out, there's no reason to wait five more seconds before retrying. This patch adds ehc->last_reset timestamp and records the timestamp for the last reset trial or success and uses it to space resets by ATA_EH_RESET_COOL_DOWN which is 5 secs and removes unconditional 5 sec sleeps. As this change makes inter-try waits often shorter and they're redundant in nature, this patch also removes the "retrying..." messages. While at it, convert explicit rounding up division to DIV_ROUND_UP(). This change speeds up EH in many cases w/o sacrificing robustness. 
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 38 ++++++++++++++++++++------------------ drivers/ata/libata-pmp.c | 10 ---------- include/linux/libata.h | 2 ++ 3 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 08dd07f1000..5b5ae631ed0 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -67,6 +67,9 @@ enum { ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, + /* always put at least this amount of time between resets */ + ATA_EH_RESET_COOL_DOWN = 5000, + /* Waiting in ->prereset can never be reliable. It's * sometimes nice to wait there but it can't be depended upon; * otherwise, we wouldn't be resetting. Just give it enough @@ -485,6 +488,9 @@ void ata_scsi_error(struct Scsi_Host *host) if (ata_ncq_enabled(dev)) ehc->saved_ncq_enabled |= 1 << devno; } + + /* set last reset timestamp to some time in the past */ + ehc->last_reset = jiffies - 60 * HZ; } ap->pflags |= ATA_PFLAG_EH_IN_PROGRESS; @@ -2088,11 +2094,17 @@ int ata_eh_reset(struct ata_link *link, int classify, /* * Prepare to reset */ + now = jiffies; + deadline = ata_deadline(ehc->last_reset, ATA_EH_RESET_COOL_DOWN); + if (time_before(now, deadline)) + schedule_timeout_uninterruptible(deadline - now); + spin_lock_irqsave(ap->lock, flags); ap->pflags |= ATA_PFLAG_RESETTING; spin_unlock_irqrestore(ap->lock, flags); ata_eh_about_to_do(link, NULL, ATA_EH_RESET); + ehc->last_reset = jiffies; ata_link_for_each_dev(dev, link) { /* If we issue an SRST then an ATA drive (not ATAPI) @@ -2158,6 +2170,7 @@ int ata_eh_reset(struct ata_link *link, int classify, /* * Perform reset */ + ehc->last_reset = jiffies; if (ata_is_host_link(link)) ata_eh_freeze_port(ap); @@ -2278,6 +2291,7 @@ int ata_eh_reset(struct ata_link *link, int classify, /* reset successful, schedule revalidation */ ata_eh_done(link, NULL, ATA_EH_RESET); + ehc->last_reset = jiffies; ehc->i.action |= 
ATA_EH_REVALIDATE; rc = 0; @@ -2304,9 +2318,9 @@ int ata_eh_reset(struct ata_link *link, int classify, if (time_before(now, deadline)) { unsigned long delta = deadline - now; - ata_link_printk(link, KERN_WARNING, "reset failed " - "(errno=%d), retrying in %u secs\n", - rc, (jiffies_to_msecs(delta) + 999) / 1000); + ata_link_printk(link, KERN_WARNING, + "reset failed (errno=%d), retrying in %u secs\n", + rc, DIV_ROUND_UP(jiffies_to_msecs(delta), 1000)); while (delta) delta = schedule_timeout_uninterruptible(delta); @@ -2623,7 +2637,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, { struct ata_link *link; struct ata_device *dev; - int nr_failed_devs, nr_disabled_devs; + int nr_failed_devs; int rc; unsigned long flags; @@ -2666,7 +2680,6 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, retry: rc = 0; nr_failed_devs = 0; - nr_disabled_devs = 0; /* if UNLOADING, finish immediately */ if (ap->pflags & ATA_PFLAG_UNLOADING) @@ -2733,8 +2746,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, dev_fail: nr_failed_devs++; - if (ata_eh_handle_dev_fail(dev, rc)) - nr_disabled_devs++; + ata_eh_handle_dev_fail(dev, rc); if (ap->pflags & ATA_PFLAG_FROZEN) { /* PMP reset requires working host port. @@ -2746,18 +2758,8 @@ dev_fail: } } - if (nr_failed_devs) { - if (nr_failed_devs != nr_disabled_devs) { - ata_port_printk(ap, KERN_WARNING, "failed to recover " - "some devices, retrying in 5 secs\n"); - ssleep(5); - } else { - /* no device left to recover, repeat fast */ - msleep(500); - } - + if (nr_failed_devs) goto retry; - } out: if (rc && r_failed_link) diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 63691d77ac4..b65db309c18 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -727,19 +727,12 @@ static int sata_pmp_eh_recover_pmp(struct ata_port *ap, } if (tries) { - int sleep = ehc->i.flags & ATA_EHI_DID_RESET; - /* consecutive revalidation failures? 
speed down */ if (reval_failed) sata_down_spd_limit(link); else reval_failed = 1; - ata_dev_printk(dev, KERN_WARNING, - "retrying reset%s\n", - sleep ? " in 5 secs" : ""); - if (sleep) - ssleep(5); ehc->i.action |= ATA_EH_RESET; goto retry; } else { @@ -991,10 +984,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap) goto retry; if (--pmp_tries) { - ata_port_printk(ap, KERN_WARNING, - "failed to recover PMP, retrying in 5 secs\n"); pmp_ehc->i.action |= ATA_EH_RESET; - ssleep(5); goto retry; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 94110b652b3..9058c2a325a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -602,6 +602,8 @@ struct ata_eh_context { unsigned int did_probe_mask; unsigned int saved_ncq_enabled; u8 saved_xfer_mode[ATA_MAX_DEVICES]; + /* timestamp for the last reset attempt or success */ + unsigned long last_reset; }; struct ata_acpi_drive -- cgit v1.2.3-70-g09d2 From 87fbc5a060faf2394bee88a93519f9b9d434727c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:54 +0900 Subject: libata: improve EH internal command timeout handling ATA_TMOUT_INTERNAL which was 30secs were used for all internal commands which is way too long when something goes wrong. This patch implements command type based stepped timeouts. Different command types can use different timeouts and each command type can use different timeout values after timeouts. ie. the initial timeout is set to a value which should cover most of the cases but not too long so that run away cases don't delay things too much. After the first try times out, the second try can use longer timeout and if that one times out too, it can go for full 30sec timeout. IDENTIFYs use 5s - 10s - 30s timeout and all other commands use 5s - 10s timeouts. This patch significantly cuts down the needed time to handle failure cases while still allowing libata to work with nut job devices through retries. 
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 16 ++++-- drivers/ata/libata-eh.c | 121 +++++++++++++++++++++++++++++++++++++++++++++- drivers/ata/libata.h | 2 + include/linux/libata.h | 8 ++- 4 files changed, 142 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c5c3b1b516e..9bef1a84fe3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -144,7 +144,7 @@ static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CF module_param_named(dma, libata_dma_mask, int, 0444); MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)"); -static int ata_probe_timeout = ATA_TMOUT_INTERNAL / 1000; +static int ata_probe_timeout; module_param(ata_probe_timeout, int, 0444); MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)"); @@ -1611,6 +1611,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, struct ata_link *link = dev->link; struct ata_port *ap = link->ap; u8 command = tf->command; + int auto_timeout = 0; struct ata_queued_cmd *qc; unsigned int tag, preempted_tag; u32 preempted_sactive, preempted_qc_active; @@ -1683,8 +1684,14 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); - if (!timeout) - timeout = ata_probe_timeout * 1000; + if (!timeout) { + if (ata_probe_timeout) + timeout = ata_probe_timeout * 1000; + else { + timeout = ata_internal_cmd_timeout(dev, command); + auto_timeout = 1; + } + } rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout)); @@ -1760,6 +1767,9 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); + if ((err_mask & AC_ERR_TIMEOUT) && auto_timeout) + ata_internal_cmd_timed_out(dev, command); + return err_mask; } diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 83d1451fa71..d5f03a6e333 100644 --- a/drivers/ata/libata-eh.c +++ 
b/drivers/ata/libata-eh.c @@ -67,6 +67,8 @@ enum { ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, + ATA_EH_CMD_DFL_TIMEOUT = 5000, + /* always put at least this amount of time between resets */ ATA_EH_RESET_COOL_DOWN = 5000, @@ -93,6 +95,53 @@ static const unsigned long ata_eh_reset_timeouts[] = { ULONG_MAX, /* > 1 min has elapsed, give up */ }; +static const unsigned long ata_eh_identify_timeouts[] = { + 5000, /* covers > 99% of successes and not too boring on failures */ + 10000, /* combined time till here is enough even for media access */ + 30000, /* for true idiots */ + ULONG_MAX, +}; + +static const unsigned long ata_eh_other_timeouts[] = { + 5000, /* same rationale as identify timeout */ + 10000, /* ditto */ + /* but no merciful 30sec for other commands, it just isn't worth it */ + ULONG_MAX, +}; + +struct ata_eh_cmd_timeout_ent { + const u8 *commands; + const unsigned long *timeouts; +}; + +/* The following table determines timeouts to use for EH internal + * commands. Each table entry is a command class and matches the + * commands the entry applies to and the timeout table to use. + * + * On the retry after a command timed out, the next timeout value from + * the table is used. If the table doesn't contain further entries, + * the last value is used. + * + * ehc->cmd_timeout_idx keeps track of which timeout to use per + * command class, so if SET_FEATURES times out on the first try, the + * next try will use the second timeout value only for that class. + */ +#define CMDS(cmds...) 
(const u8 []){ cmds, 0 } +static const struct ata_eh_cmd_timeout_ent +ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = { + { .commands = CMDS(ATA_CMD_ID_ATA, ATA_CMD_ID_ATAPI), + .timeouts = ata_eh_identify_timeouts, }, + { .commands = CMDS(ATA_CMD_READ_NATIVE_MAX, ATA_CMD_READ_NATIVE_MAX_EXT), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_SET_MAX, ATA_CMD_SET_MAX_EXT), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_SET_FEATURES), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_INIT_DEV_PARAMS), + .timeouts = ata_eh_other_timeouts, }, +}; +#undef CMDS + static void __ata_port_freeze(struct ata_port *ap); #ifdef CONFIG_PM static void ata_eh_handle_port_suspend(struct ata_port *ap); @@ -238,6 +287,73 @@ void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset, #endif /* CONFIG_PCI */ +static int ata_lookup_timeout_table(u8 cmd) +{ + int i; + + for (i = 0; i < ATA_EH_CMD_TIMEOUT_TABLE_SIZE; i++) { + const u8 *cur; + + for (cur = ata_eh_cmd_timeout_table[i].commands; *cur; cur++) + if (*cur == cmd) + return i; + } + + return -1; +} + +/** + * ata_internal_cmd_timeout - determine timeout for an internal command + * @dev: target device + * @cmd: internal command to be issued + * + * Determine timeout for internal command @cmd for @dev. + * + * LOCKING: + * EH context. + * + * RETURNS: + * Determined timeout. + */ +unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + int ent = ata_lookup_timeout_table(cmd); + int idx; + + if (ent < 0) + return ATA_EH_CMD_DFL_TIMEOUT; + + idx = ehc->cmd_timeout_idx[dev->devno][ent]; + return ata_eh_cmd_timeout_table[ent].timeouts[idx]; +} + +/** + * ata_internal_cmd_timed_out - notification for internal command timeout + * @dev: target device + * @cmd: internal command which timed out + * + * Notify EH that internal command @cmd for @dev timed out. 
This + * function should be called only for commands whose timeouts are + * determined using ata_internal_cmd_timeout(). + * + * LOCKING: + * EH context. + */ +void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + int ent = ata_lookup_timeout_table(cmd); + int idx; + + if (ent < 0) + return; + + idx = ehc->cmd_timeout_idx[dev->devno][ent]; + if (ata_eh_cmd_timeout_table[ent].timeouts[idx + 1] != ULONG_MAX) + ehc->cmd_timeout_idx[dev->devno][ent]++; +} + static void ata_ering_record(struct ata_ering *ering, unsigned int eflags, unsigned int err_mask) { @@ -2600,8 +2716,11 @@ static int ata_eh_handle_dev_fail(struct ata_device *dev, int err) ata_eh_detach_dev(dev); /* schedule probe if necessary */ - if (ata_eh_schedule_probe(dev)) + if (ata_eh_schedule_probe(dev)) { ehc->tries[dev->devno] = ATA_EH_DEV_TRIES; + memset(ehc->cmd_timeout_idx[dev->devno], 0, + sizeof(ehc->cmd_timeout_idx[dev->devno])); + } return 1; } else { diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 1cf803adbc9..f6f9c28ec7f 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -151,6 +151,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); /* libata-eh.c */ +extern unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd); +extern void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd); extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); extern void ata_scsi_error(struct Scsi_Host *host); extern void ata_port_wait_eh(struct ata_port *ap); diff --git a/include/linux/libata.h b/include/linux/libata.h index 9058c2a325a..035f8e1cd0a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -237,7 +237,6 @@ enum { /* various lengths of time */ ATA_TMOUT_BOOT = 30000, /* heuristic */ ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ - ATA_TMOUT_INTERNAL = 30000, ATA_TMOUT_INTERNAL_QUICK = 5000, /* FIXME: 
GoVault needs 2s but we can't afford that without @@ -341,6 +340,11 @@ enum { SATA_PMP_RW_TIMEOUT = 3000, /* PMP read/write timeout */ + /* This should match the actual table size of + * ata_eh_cmd_timeout_table in libata-eh.c. + */ + ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 5, + /* Horkage types. May be set by libata or controller on drives (some horkage may be drive/controller pair dependant */ @@ -598,6 +602,8 @@ struct ata_eh_info { struct ata_eh_context { struct ata_eh_info i; int tries[ATA_MAX_DEVICES]; + int cmd_timeout_idx[ATA_MAX_DEVICES] + [ATA_EH_CMD_TIMEOUT_TABLE_SIZE]; unsigned int classes[ATA_MAX_DEVICES]; unsigned int did_probe_mask; unsigned int saved_ncq_enabled; -- cgit v1.2.3-70-g09d2 From 18f7ba4c2f4be6b37d925931f04d6cc28d88d1ee Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Tue, 3 Jun 2008 10:33:55 -0700 Subject: libata/ahci: enclosure management support Add Enclosure Management support to libata and ahci. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 321 +++++++++++++++++++++++++++++++++++++++++++++- drivers/ata/libata-scsi.c | 79 ++++++++++++ include/linux/libata.h | 21 +++ 3 files changed, 419 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 5e6468a7ca4..65d4e968feb 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -56,6 +56,12 @@ MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip) static int ahci_enable_alpm(struct ata_port *ap, enum link_pm policy); static void ahci_disable_alpm(struct ata_port *ap); +static ssize_t ahci_led_show(struct ata_port *ap, char *buf); +static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, + size_t size); +static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state, + ssize_t size); +#define MAX_SLOTS 8 enum { AHCI_PCI_BAR = 5, @@ -98,6 +104,8 @@ enum { HOST_IRQ_STAT = 0x08, /* interrupt status */ HOST_PORTS_IMPL = 0x0c, /* bitmap 
of implemented ports */ HOST_VERSION = 0x10, /* AHCI spec. version compliancy */ + HOST_EM_LOC = 0x1c, /* Enclosure Management location */ + HOST_EM_CTL = 0x20, /* Enclosure Management Control */ /* HOST_CTL bits */ HOST_RESET = (1 << 0), /* reset controller; self-clear */ @@ -105,6 +113,7 @@ enum { HOST_AHCI_EN = (1 << 31), /* AHCI enabled */ /* HOST_CAP bits */ + HOST_CAP_EMS = (1 << 6), /* Enclosure Management support */ HOST_CAP_SSC = (1 << 14), /* Slumber capable */ HOST_CAP_PMP = (1 << 17), /* Port Multiplier support */ HOST_CAP_CLO = (1 << 24), /* Command List Override support */ @@ -202,6 +211,11 @@ enum { ATA_FLAG_IPM, ICH_MAP = 0x90, /* ICH MAP register */ + + /* em_ctl bits */ + EM_CTL_RST = (1 << 9), /* Reset */ + EM_CTL_TM = (1 << 8), /* Transmit Message */ + EM_CTL_ALHD = (1 << 26), /* Activity LED */ }; struct ahci_cmd_hdr { @@ -219,12 +233,21 @@ struct ahci_sg { __le32 flags_size; }; +struct ahci_em_priv { + enum sw_activity blink_policy; + struct timer_list timer; + unsigned long saved_activity; + unsigned long activity; + unsigned long led_state; +}; + struct ahci_host_priv { unsigned int flags; /* AHCI_HFLAG_* */ u32 cap; /* cap to use */ u32 port_map; /* port map to use */ u32 saved_cap; /* saved initial cap */ u32 saved_port_map; /* saved initial port_map */ + u32 em_loc; /* enclosure management location */ }; struct ahci_port_priv { @@ -240,6 +263,8 @@ struct ahci_port_priv { unsigned int ncq_saw_dmas:1; unsigned int ncq_saw_sdb:1; u32 intr_mask; /* interrupts to enable */ + struct ahci_em_priv em_priv[MAX_SLOTS];/* enclosure management info + * per PM slot */ }; static int ahci_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val); @@ -277,9 +302,20 @@ static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg); static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg); static int ahci_pci_device_resume(struct pci_dev *pdev); #endif +static ssize_t ahci_activity_show(struct ata_device *dev, char *buf); 
+static ssize_t ahci_activity_store(struct ata_device *dev, + enum sw_activity val); +static void ahci_init_sw_activity(struct ata_link *link); static struct device_attribute *ahci_shost_attrs[] = { &dev_attr_link_power_management_policy, + &dev_attr_em_message_type, + &dev_attr_em_message, + NULL +}; + +static struct device_attribute *ahci_sdev_attrs[] = { + &dev_attr_sw_activity, NULL }; @@ -289,6 +325,7 @@ static struct scsi_host_template ahci_sht = { .sg_tablesize = AHCI_MAX_SG, .dma_boundary = AHCI_DMA_BOUNDARY, .shost_attrs = ahci_shost_attrs, + .sdev_attrs = ahci_sdev_attrs, }; static struct ata_port_operations ahci_ops = { @@ -316,6 +353,10 @@ static struct ata_port_operations ahci_ops = { .enable_pm = ahci_enable_alpm, .disable_pm = ahci_disable_alpm, + .em_show = ahci_led_show, + .em_store = ahci_led_store, + .sw_activity_show = ahci_activity_show, + .sw_activity_store = ahci_activity_store, #ifdef CONFIG_PM .port_suspend = ahci_port_suspend, .port_resume = ahci_port_resume, @@ -561,6 +602,11 @@ static struct pci_driver ahci_pci_driver = { #endif }; +static int ahci_em_messages = 1; +module_param(ahci_em_messages, int, 0444); +/* add other LED protocol types when they become supported */ +MODULE_PARM_DESC(ahci_em_messages, + "Set AHCI Enclosure Management Message type (0 = disabled, 1 = LED"); static inline int ahci_nr_ports(u32 cap) { @@ -1031,11 +1077,28 @@ static void ahci_power_down(struct ata_port *ap) static void ahci_start_port(struct ata_port *ap) { + struct ahci_port_priv *pp = ap->private_data; + struct ata_link *link; + struct ahci_em_priv *emp; + /* enable FIS reception */ ahci_start_fis_rx(ap); /* enable DMA */ ahci_start_engine(ap); + + /* turn on LEDs */ + if (ap->flags & ATA_FLAG_EM) { + ata_port_for_each_link(link, ap) { + emp = &pp->em_priv[link->pmp]; + ahci_transmit_led_message(ap, emp->led_state, 4); + } + } + + if (ap->flags & ATA_FLAG_SW_ACTIVITY) + ata_port_for_each_link(link, ap) + ahci_init_sw_activity(link); + } static int 
ahci_deinit_port(struct ata_port *ap, const char **emsg) @@ -1116,6 +1179,230 @@ static int ahci_reset_controller(struct ata_host *host) return 0; } +static void ahci_sw_activity(struct ata_link *link) +{ + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + if (!(link->flags & ATA_LFLAG_SW_ACTIVITY)) + return; + + emp->activity++; + if (!timer_pending(&emp->timer)) + mod_timer(&emp->timer, jiffies + msecs_to_jiffies(10)); +} + +static void ahci_sw_activity_blink(unsigned long arg) +{ + struct ata_link *link = (struct ata_link *)arg; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + unsigned long led_message = emp->led_state; + u32 activity_led_state; + + led_message &= 0xffff0000; + led_message |= ap->port_no | (link->pmp << 8); + + /* check to see if we've had activity. If so, + * toggle state of LED and reset timer. If not, + * turn LED to desired idle state. 
+ */ + if (emp->saved_activity != emp->activity) { + emp->saved_activity = emp->activity; + /* get the current LED state */ + activity_led_state = led_message & 0x00010000; + + if (activity_led_state) + activity_led_state = 0; + else + activity_led_state = 1; + + /* clear old state */ + led_message &= 0xfff8ffff; + + /* toggle state */ + led_message |= (activity_led_state << 16); + mod_timer(&emp->timer, jiffies + msecs_to_jiffies(100)); + } else { + /* switch to idle */ + led_message &= 0xfff8ffff; + if (emp->blink_policy == BLINK_OFF) + led_message |= (1 << 16); + } + ahci_transmit_led_message(ap, led_message, 4); +} + +static void ahci_init_sw_activity(struct ata_link *link) +{ + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + /* init activity stats, setup timer */ + emp->saved_activity = emp->activity = 0; + setup_timer(&emp->timer, ahci_sw_activity_blink, (unsigned long)link); + + /* check our blink policy and set flag for link if it's enabled */ + if (emp->blink_policy) + link->flags |= ATA_LFLAG_SW_ACTIVITY; +} + +static int ahci_reset_em(struct ata_host *host) +{ + void __iomem *mmio = host->iomap[AHCI_PCI_BAR]; + u32 em_ctl; + + em_ctl = readl(mmio + HOST_EM_CTL); + if ((em_ctl & EM_CTL_TM) || (em_ctl & EM_CTL_RST)) + return -EINVAL; + + writel(em_ctl | EM_CTL_RST, mmio + HOST_EM_CTL); + return 0; +} + +static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state, + ssize_t size) +{ + struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *pp = ap->private_data; + void __iomem *mmio = ap->host->iomap[AHCI_PCI_BAR]; + u32 em_ctl; + u32 message[] = {0, 0}; + unsigned int flags; + int pmp; + struct ahci_em_priv *emp; + + /* get the slot number from the message */ + pmp = (state & 0x0000ff00) >> 8; + if (pmp < MAX_SLOTS) + emp = &pp->em_priv[pmp]; + else + return -EINVAL; + + spin_lock_irqsave(ap->lock, flags); + + /* + * if we are still 
busy transmitting a previous message, + * do not allow + */ + em_ctl = readl(mmio + HOST_EM_CTL); + if (em_ctl & EM_CTL_TM) { + spin_unlock_irqrestore(ap->lock, flags); + return -EINVAL; + } + + /* + * create message header - this is all zero except for + * the message size, which is 4 bytes. + */ + message[0] |= (4 << 8); + + /* ignore 0:4 of byte zero, fill in port info yourself */ + message[1] = ((state & 0xfffffff0) | ap->port_no); + + /* write message to EM_LOC */ + writel(message[0], mmio + hpriv->em_loc); + writel(message[1], mmio + hpriv->em_loc+4); + + /* save off new led state for port/slot */ + emp->led_state = message[1]; + + /* + * tell hardware to transmit the message + */ + writel(em_ctl | EM_CTL_TM, mmio + HOST_EM_CTL); + + spin_unlock_irqrestore(ap->lock, flags); + return size; +} + +static ssize_t ahci_led_show(struct ata_port *ap, char *buf) +{ + struct ahci_port_priv *pp = ap->private_data; + struct ata_link *link; + struct ahci_em_priv *emp; + int rc = 0; + + ata_port_for_each_link(link, ap) { + emp = &pp->em_priv[link->pmp]; + rc += sprintf(buf, "%lx\n", emp->led_state); + } + return rc; +} + +static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, + size_t size) +{ + int state; + int pmp; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp; + + state = simple_strtoul(buf, NULL, 0); + + /* get the slot number from the message */ + pmp = (state & 0x0000ff00) >> 8; + if (pmp < MAX_SLOTS) + emp = &pp->em_priv[pmp]; + else + return -EINVAL; + + /* mask off the activity bits if we are in sw_activity + * mode, user should turn off sw_activity before setting + * activity led through em_message + */ + if (emp->blink_policy) + state &= 0xfff8ffff; + + return ahci_transmit_led_message(ap, state, size); +} + +static ssize_t ahci_activity_store(struct ata_device *dev, enum sw_activity val) +{ + struct ata_link *link = dev->link; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct 
ahci_em_priv *emp = &pp->em_priv[link->pmp]; + u32 port_led_state = emp->led_state; + + /* save the desired Activity LED behavior */ + if (val == OFF) { + /* clear LFLAG */ + link->flags &= ~(ATA_LFLAG_SW_ACTIVITY); + + /* set the LED to OFF */ + port_led_state &= 0xfff80000; + port_led_state |= (ap->port_no | (link->pmp << 8)); + ahci_transmit_led_message(ap, port_led_state, 4); + } else { + link->flags |= ATA_LFLAG_SW_ACTIVITY; + if (val == BLINK_OFF) { + /* set LED to ON for idle */ + port_led_state &= 0xfff80000; + port_led_state |= (ap->port_no | (link->pmp << 8)); + port_led_state |= 0x00010000; /* check this */ + ahci_transmit_led_message(ap, port_led_state, 4); + } + } + emp->blink_policy = val; + return 0; +} + +static ssize_t ahci_activity_show(struct ata_device *dev, char *buf) +{ + struct ata_link *link = dev->link; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + /* display the saved value of activity behavior for this + * disk. + */ + return sprintf(buf, "%d\n", emp->blink_policy); +} + static void ahci_port_init(struct pci_dev *pdev, struct ata_port *ap, int port_no, void __iomem *mmio, void __iomem *port_mmio) @@ -1848,6 +2135,8 @@ static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc) writel(1 << qc->tag, port_mmio + PORT_CMD_ISSUE); readl(port_mmio + PORT_CMD_ISSUE); /* flush */ + ahci_sw_activity(qc->dev->link); + return 0; } @@ -2154,7 +2443,8 @@ static void ahci_print_info(struct ata_host *host) dev_printk(KERN_INFO, &pdev->dev, "flags: " "%s%s%s%s%s%s%s" - "%s%s%s%s%s%s%s\n" + "%s%s%s%s%s%s%s" + "%s\n" , cap & (1 << 31) ? "64bit " : "", @@ -2171,7 +2461,8 @@ static void ahci_print_info(struct ata_host *host) cap & (1 << 17) ? "pmp " : "", cap & (1 << 15) ? "pio " : "", cap & (1 << 14) ? "slum " : "", - cap & (1 << 13) ? "part " : "" + cap & (1 << 13) ? "part " : "", + cap & (1 << 6) ? 
"ems ": "" ); } @@ -2291,6 +2582,24 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (hpriv->cap & HOST_CAP_PMP) pi.flags |= ATA_FLAG_PMP; + if (ahci_em_messages && (hpriv->cap & HOST_CAP_EMS)) { + u8 messages; + void __iomem *mmio = pcim_iomap_table(pdev)[AHCI_PCI_BAR]; + u32 em_loc = readl(mmio + HOST_EM_LOC); + u32 em_ctl = readl(mmio + HOST_EM_CTL); + + messages = (em_ctl & 0x000f0000) >> 16; + + /* we only support LED message type right now */ + if ((messages & 0x01) && (ahci_em_messages == 1)) { + /* store em_loc */ + hpriv->em_loc = ((em_loc >> 16) * 4); + pi.flags |= ATA_FLAG_EM; + if (!(em_ctl & EM_CTL_ALHD)) + pi.flags |= ATA_FLAG_SW_ACTIVITY; + } + } + /* CAP.NP sometimes indicate the index of the last enabled * port, at other times, that of the last possible port, so * determining the maximum port number requires looking at @@ -2304,6 +2613,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) host->iomap = pcim_iomap_table(pdev); host->private_data = hpriv; + if (pi.flags & ATA_FLAG_EM) + ahci_reset_em(host); + for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; @@ -2314,6 +2626,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* set initial link pm policy */ ap->pm_policy = NOT_AVAILABLE; + /* set enclosure management message type */ + if (ap->flags & ATA_FLAG_EM) + ap->em_message_type = ahci_em_messages; + + /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a43649a46..b578b11caa7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -190,6 +190,85 @@ static void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq) scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq); } +static ssize_t +ata_scsi_em_message_store(struct device *dev, struct device_attribute 
*attr, + const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + if (ap->ops->em_store && (ap->flags & ATA_FLAG_EM)) + return ap->ops->em_store(ap, buf, count); + return -EINVAL; +} + +static ssize_t +ata_scsi_em_message_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + + if (ap->ops->em_show && (ap->flags & ATA_FLAG_EM)) + return ap->ops->em_show(ap, buf); + return -EINVAL; +} +DEVICE_ATTR(em_message, S_IRUGO | S_IWUGO, + ata_scsi_em_message_show, ata_scsi_em_message_store); +EXPORT_SYMBOL_GPL(dev_attr_em_message); + +static ssize_t +ata_scsi_em_message_type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + + return snprintf(buf, 23, "%d\n", ap->em_message_type); +} +DEVICE_ATTR(em_message_type, S_IRUGO, + ata_scsi_em_message_type_show, NULL); +EXPORT_SYMBOL_GPL(dev_attr_em_message_type); + +static ssize_t +ata_scsi_activity_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *atadev = ata_scsi_find_dev(ap, sdev); + + if (ap->ops->sw_activity_show && (ap->flags & ATA_FLAG_SW_ACTIVITY)) + return ap->ops->sw_activity_show(atadev, buf); + return -EINVAL; +} + +static ssize_t +ata_scsi_activity_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *atadev = ata_scsi_find_dev(ap, sdev); + enum sw_activity val; + int rc; + + if (ap->ops->sw_activity_store && (ap->flags & ATA_FLAG_SW_ACTIVITY)) { + val = simple_strtoul(buf, NULL, 0); + switch (val) 
{ + case OFF: case BLINK_ON: case BLINK_OFF: + rc = ap->ops->sw_activity_store(atadev, val); + if (!rc) + return count; + else + return rc; + } + } + return -EINVAL; +} +DEVICE_ATTR(sw_activity, S_IWUGO | S_IRUGO, ata_scsi_activity_show, + ata_scsi_activity_store); +EXPORT_SYMBOL_GPL(dev_attr_sw_activity); + static void ata_scsi_invalid_field(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) { diff --git a/include/linux/libata.h b/include/linux/libata.h index 035f8e1cd0a..5b247b8a6b3 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -169,6 +169,7 @@ enum { ATA_LFLAG_ASSUME_CLASS = ATA_LFLAG_ASSUME_ATA | ATA_LFLAG_ASSUME_SEMB, ATA_LFLAG_NO_RETRY = (1 << 5), /* don't retry this link */ ATA_LFLAG_DISABLED = (1 << 6), /* link is disabled */ + ATA_LFLAG_SW_ACTIVITY = (1 << 7), /* keep activity stats */ /* struct ata_port flags */ ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ @@ -191,6 +192,10 @@ enum { ATA_FLAG_AN = (1 << 18), /* controller supports AN */ ATA_FLAG_PMP = (1 << 19), /* controller supports PMP */ ATA_FLAG_IPM = (1 << 20), /* driver can handle IPM */ + ATA_FLAG_EM = (1 << 21), /* driver supports enclosure + * management */ + ATA_FLAG_SW_ACTIVITY = (1 << 22), /* driver supports sw activity + * led */ /* The following flag belongs to ap->pflags but is kept in * ap->flags because it's referenced in many LLDs and will be @@ -446,6 +451,15 @@ enum link_pm { MEDIUM_POWER, }; extern struct device_attribute dev_attr_link_power_management_policy; +extern struct device_attribute dev_attr_em_message_type; +extern struct device_attribute dev_attr_em_message; +extern struct device_attribute dev_attr_sw_activity; + +enum sw_activity { + OFF, + BLINK_ON, + BLINK_OFF, +}; #ifdef CONFIG_ATA_SFF struct ata_ioports { @@ -701,6 +715,7 @@ struct ata_port { struct timer_list fastdrain_timer; unsigned long fastdrain_cnt; + int em_message_type; void *private_data; #ifdef CONFIG_ATA_ACPI @@ -792,6 +807,12 @@ struct ata_port_operations { u8 
(*bmdma_status)(struct ata_port *ap); #endif /* CONFIG_ATA_SFF */ + ssize_t (*em_show)(struct ata_port *ap, char *buf); + ssize_t (*em_store)(struct ata_port *ap, const char *message, + size_t size); + ssize_t (*sw_activity_show)(struct ata_device *dev, char *buf); + ssize_t (*sw_activity_store)(struct ata_device *dev, + enum sw_activity val); /* * Obsolete */ -- cgit v1.2.3-70-g09d2 From 20a9b6e7c303f2a6f9afe17c0997bc9a3c734442 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 14 Jul 2008 22:38:22 +0200 Subject: i2c: Remove 3 deprecated bus drivers This patch contains the scheduled removal of i2c-i810, i2c-prosavage and i2c-savage4. Signed-off-by: Adrian Bunk Signed-off-by: Jean Delvare --- Documentation/feature-removal-schedule.txt | 7 - Documentation/i2c/busses/i2c-i810 | 47 ----- Documentation/i2c/busses/i2c-prosavage | 23 -- Documentation/i2c/busses/i2c-savage4 | 26 --- drivers/i2c/busses/Kconfig | 52 ----- drivers/i2c/busses/Makefile | 3 - drivers/i2c/busses/i2c-i810.c | 260 ----------------------- drivers/i2c/busses/i2c-prosavage.c | 325 ----------------------------- drivers/i2c/busses/i2c-savage4.c | 185 ---------------- include/linux/i2c-id.h | 1 - 10 files changed, 929 deletions(-) delete mode 100644 Documentation/i2c/busses/i2c-i810 delete mode 100644 Documentation/i2c/busses/i2c-prosavage delete mode 100644 Documentation/i2c/busses/i2c-savage4 delete mode 100644 drivers/i2c/busses/i2c-i810.c delete mode 100644 drivers/i2c/busses/i2c-prosavage.c delete mode 100644 drivers/i2c/busses/i2c-savage4.c (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 46ece3fba6f..65a1482457a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -222,13 +222,6 @@ Who: Thomas Gleixner --------------------------- -What: i2c-i810, i2c-prosavage and i2c-savage4 -When: May 2008 -Why: These drivers are superseded by i810fb, intelfb and 
savagefb. -Who: Jean Delvare - ---------------------------- - What (Why): - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files (superseded by xt_TOS/xt_tos target & match) diff --git a/Documentation/i2c/busses/i2c-i810 b/Documentation/i2c/busses/i2c-i810 deleted file mode 100644 index 778210ee158..00000000000 --- a/Documentation/i2c/busses/i2c-i810 +++ /dev/null @@ -1,47 +0,0 @@ -Kernel driver i2c-i810 - -Supported adapters: - * Intel 82810, 82810-DC100, 82810E, and 82815 (GMCH) - * Intel 82845G (GMCH) - -Authors: - Frodo Looijaard , - Philip Edelbrock , - Kyösti Mälkki , - Ralph Metzler , - Mark D. Studebaker - -Main contact: Mark Studebaker - -Description ------------ - -WARNING: If you have an '810' or '815' motherboard, your standard I2C -temperature sensors are most likely on the 801's I2C bus. You want the -i2c-i801 driver for those, not this driver. - -Now for the i2c-i810... - -The GMCH chip contains two I2C interfaces. - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . - -The second interface is a general-purpose I2C bus. It may be connected to a -TV-out chip such as the BT869 or possibly to a digital flat-panel display. - -Features --------- - -Both busses use the i2c-algo-bit driver for 'bit banging' -and support for specific transactions is provided by i2c-algo-bit. - -Issues ------- - -If you enable bus testing in i2c-algo-bit (insmod i2c-algo-bit bit_test=1), -the test may fail; if so, the i2c-i810 driver won't be inserted. However, -we think this has been fixed. 
diff --git a/Documentation/i2c/busses/i2c-prosavage b/Documentation/i2c/busses/i2c-prosavage deleted file mode 100644 index 70368790251..00000000000 --- a/Documentation/i2c/busses/i2c-prosavage +++ /dev/null @@ -1,23 +0,0 @@ -Kernel driver i2c-prosavage - -Supported adapters: - - S3/VIA KM266/VT8375 aka ProSavage8 - S3/VIA KM133/VT8365 aka Savage4 - -Author: Henk Vergonet - -Description ------------ - -The Savage4 chips contain two I2C interfaces (aka a I2C 'master' or -'host'). - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . The second interface is a general-purpose I2C bus. - -Usefull for gaining access to the TV Encoder chips. - diff --git a/Documentation/i2c/busses/i2c-savage4 b/Documentation/i2c/busses/i2c-savage4 deleted file mode 100644 index 6ecceab618d..00000000000 --- a/Documentation/i2c/busses/i2c-savage4 +++ /dev/null @@ -1,26 +0,0 @@ -Kernel driver i2c-savage4 - -Supported adapters: - * Savage4 - * Savage2000 - -Authors: - Alexander Wold , - Mark D. Studebaker - -Description ------------ - -The Savage4 chips contain two I2C interfaces (aka a I2C 'master' -or 'host'). - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . The DDC bus is not yet supported because its register -is not directly memory-mapped. - -The second interface is a general-purpose I2C bus. This is the only -interface supported by the driver at the moment. 
- diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 00d76e13588..b7cce921183 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -186,26 +186,6 @@ config I2C_I801 This driver can also be built as a module. If so, the module will be called i2c-i801. -config I2C_I810 - tristate "Intel 810/815 (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the Intel - 810/815 family of mainboard I2C interfaces. Specifically, the - following versions of the chipset are supported: - i810AA - i810AB - i810E - i815 - i845G - - This driver is deprecated in favor of the i810fb and intelfb drivers. - - This driver can also be built as a module. If so, the module - will be called i2c-i810. - config I2C_PXA tristate "Intel PXA2XX I2C adapter (EXPERIMENTAL)" depends on EXPERIMENTAL && ARCH_PXA @@ -402,24 +382,6 @@ config I2C_PASEMI help Supports the PA Semi PWRficient on-chip SMBus interfaces. -config I2C_PROSAVAGE - tristate "S3/VIA (Pro)Savage (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the - I2C bus and DDC bus of the S3VIA embedded Savage4 and ProSavage8 - graphics processors. - chipsets supported: - S3/VIA KM266/VT8375 aka ProSavage8 - S3/VIA KM133/VT8365 aka Savage4 - - This driver is deprecated in favor of the savagefb driver. - - This support is also available as a module. If so, the module - will be called i2c-prosavage. - config I2C_S3C2410 tristate "S3C2410 I2C Driver" depends on ARCH_S3C2410 @@ -427,20 +389,6 @@ config I2C_S3C2410 Say Y here to include support for I2C controller in the Samsung S3C2410 based System-on-Chip devices. -config I2C_SAVAGE4 - tristate "S3 Savage 4 (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the - S3 Savage 4 I2C interface. 
- - This driver is deprecated in favor of the savagefb driver. - - This driver can also be built as a module. If so, the module - will be called i2c-savage4. - config I2C_SIBYTE tristate "SiByte SMBus interface" depends on SIBYTE_SB1xxx_SOC diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 8b0a8c25790..81bb407d24c 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -16,7 +16,6 @@ obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o obj-$(CONFIG_I2C_GPIO) += i2c-gpio.o obj-$(CONFIG_I2C_HYDRA) += i2c-hydra.o obj-$(CONFIG_I2C_I801) += i2c-i801.o -obj-$(CONFIG_I2C_I810) += i2c-i810.o obj-$(CONFIG_I2C_IBM_IIC) += i2c-ibm_iic.o obj-$(CONFIG_I2C_IOP3XX) += i2c-iop3xx.o obj-$(CONFIG_I2C_IXP2000) += i2c-ixp2000.o @@ -35,10 +34,8 @@ obj-$(CONFIG_I2C_PCA_PLATFORM) += i2c-pca-platform.o obj-$(CONFIG_I2C_PIIX4) += i2c-piix4.o obj-$(CONFIG_I2C_PMCMSP) += i2c-pmcmsp.o obj-$(CONFIG_I2C_PNX) += i2c-pnx.o -obj-$(CONFIG_I2C_PROSAVAGE) += i2c-prosavage.o obj-$(CONFIG_I2C_PXA) += i2c-pxa.o obj-$(CONFIG_I2C_S3C2410) += i2c-s3c2410.o -obj-$(CONFIG_I2C_SAVAGE4) += i2c-savage4.o obj-$(CONFIG_I2C_SH7760) += i2c-sh7760.o obj-$(CONFIG_I2C_SH_MOBILE) += i2c-sh_mobile.o obj-$(CONFIG_I2C_SIBYTE) += i2c-sibyte.o diff --git a/drivers/i2c/busses/i2c-i810.c b/drivers/i2c/busses/i2c-i810.c deleted file mode 100644 index 42e8d94c276..00000000000 --- a/drivers/i2c/busses/i2c-i810.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - i2c-i810.c - Part of lm_sensors, Linux kernel modules for hardware - monitoring - Copyright (c) 1998, 1999, 2000 Frodo Looijaard , - Philip Edelbrock , - Ralph Metzler , and - Mark D. Studebaker - - Based on code written by Ralph Metzler and - Simon Vogl - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ -/* - This interfaces to the I810/I815 to provide access to - the DDC Bus and the I2C Bus. - - SUPPORTED DEVICES PCI ID - i810AA 7121 - i810AB 7123 - i810E 7125 - i815 1132 - i845G 2562 -*/ - -#include -#include -#include -#include -#include -#include -#include - -/* GPIO register locations */ -#define I810_IOCONTROL_OFFSET 0x5000 -#define I810_HVSYNC 0x00 /* not used */ -#define I810_GPIOA 0x10 -#define I810_GPIOB 0x14 - -/* bit locations in the registers */ -#define SCL_DIR_MASK 0x0001 -#define SCL_DIR 0x0002 -#define SCL_VAL_MASK 0x0004 -#define SCL_VAL_OUT 0x0008 -#define SCL_VAL_IN 0x0010 -#define SDA_DIR_MASK 0x0100 -#define SDA_DIR 0x0200 -#define SDA_VAL_MASK 0x0400 -#define SDA_VAL_OUT 0x0800 -#define SDA_VAL_IN 0x1000 - -/* initialization states */ -#define INIT1 0x1 -#define INIT2 0x2 -#define INIT3 0x4 - -/* delays */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - -static void __iomem *ioaddr; - -/* The i810 GPIO registers have individual masks for each bit - so we never have to read before writing. Nice. */ - -static void bit_i810i2c_setscl(void *data, int val) -{ - writel((val ? SCL_VAL_OUT : 0) | SCL_DIR | SCL_DIR_MASK | SCL_VAL_MASK, - ioaddr + I810_GPIOB); - readl(ioaddr + I810_GPIOB); /* flush posted write */ -} - -static void bit_i810i2c_setsda(void *data, int val) -{ - writel((val ? SDA_VAL_OUT : 0) | SDA_DIR | SDA_DIR_MASK | SDA_VAL_MASK, - ioaddr + I810_GPIOB); - readl(ioaddr + I810_GPIOB); /* flush posted write */ -} - -/* The GPIO pins are open drain, so the pins could always remain outputs. 
- However, some chip versions don't latch the inputs unless they - are set as inputs. - We rely on the i2c-algo-bit routines to set the pins high before - reading the input from other chips. Following guidance in the 815 - prog. ref. guide, we do a "dummy write" of 0 to the register before - reading which forces the input value to be latched. We presume this - applies to the 810 as well; shouldn't hurt anyway. This is necessary to get - i2c_algo_bit bit_test=1 to pass. */ - -static int bit_i810i2c_getscl(void *data) -{ - writel(SCL_DIR_MASK, ioaddr + I810_GPIOB); - writel(0, ioaddr + I810_GPIOB); - return (0 != (readl(ioaddr + I810_GPIOB) & SCL_VAL_IN)); -} - -static int bit_i810i2c_getsda(void *data) -{ - writel(SDA_DIR_MASK, ioaddr + I810_GPIOB); - writel(0, ioaddr + I810_GPIOB); - return (0 != (readl(ioaddr + I810_GPIOB) & SDA_VAL_IN)); -} - -static void bit_i810ddc_setscl(void *data, int val) -{ - writel((val ? SCL_VAL_OUT : 0) | SCL_DIR | SCL_DIR_MASK | SCL_VAL_MASK, - ioaddr + I810_GPIOA); - readl(ioaddr + I810_GPIOA); /* flush posted write */ -} - -static void bit_i810ddc_setsda(void *data, int val) -{ - writel((val ? 
SDA_VAL_OUT : 0) | SDA_DIR | SDA_DIR_MASK | SDA_VAL_MASK, - ioaddr + I810_GPIOA); - readl(ioaddr + I810_GPIOA); /* flush posted write */ -} - -static int bit_i810ddc_getscl(void *data) -{ - writel(SCL_DIR_MASK, ioaddr + I810_GPIOA); - writel(0, ioaddr + I810_GPIOA); - return (0 != (readl(ioaddr + I810_GPIOA) & SCL_VAL_IN)); -} - -static int bit_i810ddc_getsda(void *data) -{ - writel(SDA_DIR_MASK, ioaddr + I810_GPIOA); - writel(0, ioaddr + I810_GPIOA); - return (0 != (readl(ioaddr + I810_GPIOA) & SDA_VAL_IN)); -} - -static int config_i810(struct pci_dev *dev) -{ - unsigned long cadr; - - /* map I810 memory */ - cadr = dev->resource[1].start; - cadr += I810_IOCONTROL_OFFSET; - cadr &= PCI_BASE_ADDRESS_MEM_MASK; - ioaddr = ioremap_nocache(cadr, 0x1000); - if (ioaddr) { - bit_i810i2c_setscl(NULL, 1); - bit_i810i2c_setsda(NULL, 1); - bit_i810ddc_setscl(NULL, 1); - bit_i810ddc_setsda(NULL, 1); - return 0; - } - return -ENODEV; -} - -static struct i2c_algo_bit_data i810_i2c_bit_data = { - .setsda = bit_i810i2c_setsda, - .setscl = bit_i810i2c_setscl, - .getsda = bit_i810i2c_getsda, - .getscl = bit_i810i2c_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT, -}; - -static struct i2c_adapter i810_i2c_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_I810, - .name = "I810/I815 I2C Adapter", - .algo_data = &i810_i2c_bit_data, -}; - -static struct i2c_algo_bit_data i810_ddc_bit_data = { - .setsda = bit_i810ddc_setsda, - .setscl = bit_i810ddc_setscl, - .getsda = bit_i810ddc_getsda, - .getscl = bit_i810ddc_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT, -}; - -static struct i2c_adapter i810_ddc_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_I810, - .name = "I810/I815 DDC Adapter", - .algo_data = &i810_ddc_bit_data, -}; - -static struct pci_device_id i810_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG1) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG3) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_82810E_IG) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82845G_IG) }, - { 0, }, -}; - -MODULE_DEVICE_TABLE (pci, i810_ids); - -static int __devinit i810_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int retval; - - retval = config_i810(dev); - if (retval) - return retval; - dev_info(&dev->dev, "i810/i815 i2c device found.\n"); - - /* set up the sysfs linkage to our parent device */ - i810_i2c_adapter.dev.parent = &dev->dev; - i810_ddc_adapter.dev.parent = &dev->dev; - - retval = i2c_bit_add_bus(&i810_i2c_adapter); - if (retval) - return retval; - retval = i2c_bit_add_bus(&i810_ddc_adapter); - if (retval) - i2c_del_adapter(&i810_i2c_adapter); - return retval; -} - -static void __devexit i810_remove(struct pci_dev *dev) -{ - i2c_del_adapter(&i810_ddc_adapter); - i2c_del_adapter(&i810_i2c_adapter); - iounmap(ioaddr); -} - -static struct pci_driver i810_driver = { - .name = "i810_smbus", - .id_table = i810_ids, - .probe = i810_probe, - .remove = __devexit_p(i810_remove), -}; - -static int __init i2c_i810_init(void) -{ - return pci_register_driver(&i810_driver); -} - -static void __exit i2c_i810_exit(void) -{ - pci_unregister_driver(&i810_driver); -} - -MODULE_AUTHOR("Frodo Looijaard , " - "Philip Edelbrock , " - "Ralph Metzler , " - "and Mark D. Studebaker "); -MODULE_DESCRIPTION("I810/I815 I2C/DDC driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_i810_init); -module_exit(i2c_i810_exit); diff --git a/drivers/i2c/busses/i2c-prosavage.c b/drivers/i2c/busses/i2c-prosavage.c deleted file mode 100644 index 07c1f1e27df..00000000000 --- a/drivers/i2c/busses/i2c-prosavage.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - * kernel/busses/i2c-prosavage.c - * - * i2c bus driver for S3/VIA 8365/8375 graphics processor. - * Copyright (c) 2003 Henk Vergonet - * Based on code written by: - * Frodo Looijaard , - * Philip Edelbrock , - * Ralph Metzler , and - * Mark D. 
Studebaker - * Simon Vogl - * and others - * - * Please read the lm_sensors documentation for details on use. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ -/* 18-05-2003 HVE - created - * 14-06-2003 HVE - adapted for lm_sensors2 - * 17-06-2003 HVE - linux 2.5.xx compatible - * 18-06-2003 HVE - codingstyle - * 21-06-2003 HVE - compatibility lm_sensors2 and linux 2.5.xx - * codingstyle, mmio enabled - * - * This driver interfaces to the I2C bus of the VIA north bridge embedded - * ProSavage4/8 devices. Usefull for gaining access to the TV Encoder chips. 
- * - * Graphics cores: - * S3/VIA KM266/VT8375 aka ProSavage8 - * S3/VIA KM133/VT8365 aka Savage4 - * - * Two serial busses are implemented: - * SERIAL1 - I2C serial communications interface - * SERIAL2 - DDC2 monitor communications interface - * - * Tested on a FX41 mainboard, see http://www.shuttle.com - * - * - * TODO: - * - integration with prosavage framebuffer device - * (Additional documentation needed :( - */ - -#include -#include -#include -#include -#include -#include - -/* - * driver configuration - */ -#define MAX_BUSSES 2 - -struct s_i2c_bus { - void __iomem *mmvga; - int i2c_reg; - int adap_ok; - struct i2c_adapter adap; - struct i2c_algo_bit_data algo; -}; - -struct s_i2c_chip { - void __iomem *mmio; - struct s_i2c_bus i2c_bus[MAX_BUSSES]; -}; - - -/* - * i2c configuration - */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - - -/* - * S3/VIA 8365/8375 registers - */ -#define VGA_CR_IX 0x3d4 -#define VGA_CR_DATA 0x3d5 - -#define CR_SERIAL1 0xa0 /* I2C serial communications interface */ -#define MM_SERIAL1 0xff20 -#define CR_SERIAL2 0xb1 /* DDC2 monitor communications interface */ - -/* based on vt8365 documentation */ -#define I2C_ENAB 0x10 -#define I2C_SCL_OUT 0x01 -#define I2C_SDA_OUT 0x02 -#define I2C_SCL_IN 0x04 -#define I2C_SDA_IN 0x08 - -#define SET_CR_IX(p, val) writeb((val), (p)->mmvga + VGA_CR_IX) -#define SET_CR_DATA(p, val) writeb((val), (p)->mmvga + VGA_CR_DATA) -#define GET_CR_DATA(p) readb((p)->mmvga + VGA_CR_DATA) - - -/* - * Serial bus line handling - * - * serial communications register as parameter in private data - * - * TODO: locks with other code sections accessing video registers? 
- */ -static void bit_s3via_setscl(void *bus, int val) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - unsigned int r; - - SET_CR_IX(p, p->i2c_reg); - r = GET_CR_DATA(p); - r |= I2C_ENAB; - if (val) { - r |= I2C_SCL_OUT; - } else { - r &= ~I2C_SCL_OUT; - } - SET_CR_DATA(p, r); -} - -static void bit_s3via_setsda(void *bus, int val) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - unsigned int r; - - SET_CR_IX(p, p->i2c_reg); - r = GET_CR_DATA(p); - r |= I2C_ENAB; - if (val) { - r |= I2C_SDA_OUT; - } else { - r &= ~I2C_SDA_OUT; - } - SET_CR_DATA(p, r); -} - -static int bit_s3via_getscl(void *bus) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - - SET_CR_IX(p, p->i2c_reg); - return (0 != (GET_CR_DATA(p) & I2C_SCL_IN)); -} - -static int bit_s3via_getsda(void *bus) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - - SET_CR_IX(p, p->i2c_reg); - return (0 != (GET_CR_DATA(p) & I2C_SDA_IN)); -} - - -/* - * adapter initialisation - */ -static int i2c_register_bus(struct pci_dev *dev, struct s_i2c_bus *p, void __iomem *mmvga, u32 i2c_reg) -{ - int ret; - p->adap.owner = THIS_MODULE; - p->adap.id = I2C_HW_B_S3VIA; - p->adap.algo_data = &p->algo; - p->adap.dev.parent = &dev->dev; - p->algo.setsda = bit_s3via_setsda; - p->algo.setscl = bit_s3via_setscl; - p->algo.getsda = bit_s3via_getsda; - p->algo.getscl = bit_s3via_getscl; - p->algo.udelay = CYCLE_DELAY; - p->algo.timeout = TIMEOUT; - p->algo.data = p; - p->mmvga = mmvga; - p->i2c_reg = i2c_reg; - - ret = i2c_bit_add_bus(&p->adap); - if (ret) { - return ret; - } - - p->adap_ok = 1; - return 0; -} - - -/* - * Cleanup stuff - */ -static void prosavage_remove(struct pci_dev *dev) -{ - struct s_i2c_chip *chip; - int i, ret; - - chip = (struct s_i2c_chip *)pci_get_drvdata(dev); - - if (!chip) { - return; - } - for (i = MAX_BUSSES - 1; i >= 0; i--) { - if (chip->i2c_bus[i].adap_ok == 0) - continue; - - ret = i2c_del_adapter(&chip->i2c_bus[i].adap); - if (ret) { - dev_err(&dev->dev, "%s not removed\n", - 
chip->i2c_bus[i].adap.name); - } - } - if (chip->mmio) { - iounmap(chip->mmio); - } - kfree(chip); -} - - -/* - * Detect chip and initialize it - */ -static int __devinit prosavage_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int ret; - unsigned long base, len; - struct s_i2c_chip *chip; - struct s_i2c_bus *bus; - - pci_set_drvdata(dev, kzalloc(sizeof(struct s_i2c_chip), GFP_KERNEL)); - chip = (struct s_i2c_chip *)pci_get_drvdata(dev); - if (chip == NULL) { - return -ENOMEM; - } - - base = dev->resource[0].start & PCI_BASE_ADDRESS_MEM_MASK; - len = dev->resource[0].end - base + 1; - chip->mmio = ioremap_nocache(base, len); - - if (chip->mmio == NULL) { - dev_err(&dev->dev, "ioremap failed\n"); - prosavage_remove(dev); - return -ENODEV; - } - - - /* - * Chip initialisation - */ - /* Unlock Extended IO Space ??? */ - - - /* - * i2c bus registration - */ - bus = &chip->i2c_bus[0]; - snprintf(bus->adap.name, sizeof(bus->adap.name), - "ProSavage I2C bus at %02x:%02x.%x", - dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); - ret = i2c_register_bus(dev, bus, chip->mmio + 0x8000, CR_SERIAL1); - if (ret) { - goto err_adap; - } - /* - * ddc bus registration - */ - bus = &chip->i2c_bus[1]; - snprintf(bus->adap.name, sizeof(bus->adap.name), - "ProSavage DDC bus at %02x:%02x.%x", - dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); - ret = i2c_register_bus(dev, bus, chip->mmio + 0x8000, CR_SERIAL2); - if (ret) { - goto err_adap; - } - return 0; -err_adap: - dev_err(&dev->dev, "%s failed\n", bus->adap.name); - prosavage_remove(dev); - return ret; -} - - -/* - * Data for PCI driver interface - */ -static struct pci_device_id prosavage_pci_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_SAVAGE4) }, - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_PROSAVAGE8) }, - { 0, }, -}; - -MODULE_DEVICE_TABLE (pci, prosavage_pci_tbl); - -static struct pci_driver prosavage_driver = { - .name = "prosavage_smbus", - .id_table = 
prosavage_pci_tbl, - .probe = prosavage_probe, - .remove = prosavage_remove, -}; - -static int __init i2c_prosavage_init(void) -{ - return pci_register_driver(&prosavage_driver); -} - -static void __exit i2c_prosavage_exit(void) -{ - pci_unregister_driver(&prosavage_driver); -} - -MODULE_DEVICE_TABLE(pci, prosavage_pci_tbl); -MODULE_AUTHOR("Henk Vergonet"); -MODULE_DESCRIPTION("ProSavage VIA 8365/8375 smbus driver"); -MODULE_LICENSE("GPL"); - -module_init (i2c_prosavage_init); -module_exit (i2c_prosavage_exit); diff --git a/drivers/i2c/busses/i2c-savage4.c b/drivers/i2c/busses/i2c-savage4.c deleted file mode 100644 index 8adf4abaa03..00000000000 --- a/drivers/i2c/busses/i2c-savage4.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - i2c-savage4.c - Part of lm_sensors, Linux kernel modules for hardware - monitoring - Copyright (C) 1998-2003 The LM Sensors Team - Alexander Wold - Mark D. Studebaker - - Based on i2c-voodoo3.c. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -/* This interfaces to the I2C bus of the Savage4 to gain access to - the BT869 and possibly other I2C devices. The DDC bus is not - yet supported because its register is not memory-mapped. 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -/* device IDs */ -#define PCI_CHIP_SAVAGE4 0x8A22 -#define PCI_CHIP_SAVAGE2000 0x9102 - -#define REG 0xff20 /* Serial Port 1 Register */ - -/* bit locations in the register */ -#define I2C_ENAB 0x00000020 -#define I2C_SCL_OUT 0x00000001 -#define I2C_SDA_OUT 0x00000002 -#define I2C_SCL_IN 0x00000008 -#define I2C_SDA_IN 0x00000010 - -/* delays */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - - -static void __iomem *ioaddr; - -/* The sav GPIO registers don't have individual masks for each bit - so we always have to read before writing. */ - -static void bit_savi2c_setscl(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if(val) - r |= I2C_SCL_OUT; - else - r &= ~I2C_SCL_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -static void bit_savi2c_setsda(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if(val) - r |= I2C_SDA_OUT; - else - r &= ~I2C_SDA_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -/* The GPIO pins are open drain, so the pins always remain outputs. - We rely on the i2c-algo-bit routines to set the pins high before - reading the input from other chips. 
*/ - -static int bit_savi2c_getscl(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SCL_IN)); -} - -static int bit_savi2c_getsda(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SDA_IN)); -} - -/* Configures the chip */ - -static int config_s4(struct pci_dev *dev) -{ - unsigned long cadr; - - /* map memory */ - cadr = dev->resource[0].start; - cadr &= PCI_BASE_ADDRESS_MEM_MASK; - ioaddr = ioremap_nocache(cadr, 0x0080000); - if (ioaddr) { - /* writel(0x8160, ioaddr + REG2); */ - writel(0x00000020, ioaddr + REG); - dev_info(&dev->dev, "Using Savage4 at %p\n", ioaddr); - return 0; - } - return -ENODEV; -} - -static struct i2c_algo_bit_data sav_i2c_bit_data = { - .setsda = bit_savi2c_setsda, - .setscl = bit_savi2c_setscl, - .getsda = bit_savi2c_getsda, - .getscl = bit_savi2c_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT -}; - -static struct i2c_adapter savage4_i2c_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_SAVAGE, - .name = "I2C Savage4 adapter", - .algo_data = &sav_i2c_bit_data, -}; - -static struct pci_device_id savage4_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_CHIP_SAVAGE4) }, - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_CHIP_SAVAGE2000) }, - { 0, } -}; - -MODULE_DEVICE_TABLE (pci, savage4_ids); - -static int __devinit savage4_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int retval; - - retval = config_s4(dev); - if (retval) - return retval; - - /* set up the sysfs linkage to our parent device */ - savage4_i2c_adapter.dev.parent = &dev->dev; - - return i2c_bit_add_bus(&savage4_i2c_adapter); -} - -static void __devexit savage4_remove(struct pci_dev *dev) -{ - i2c_del_adapter(&savage4_i2c_adapter); - iounmap(ioaddr); -} - -static struct pci_driver savage4_driver = { - .name = "savage4_smbus", - .id_table = savage4_ids, - .probe = savage4_probe, - .remove = __devexit_p(savage4_remove), -}; - -static int __init i2c_savage4_init(void) -{ - return pci_register_driver(&savage4_driver); -} - -static void __exit 
i2c_savage4_exit(void) -{ - pci_unregister_driver(&savage4_driver); -} - -MODULE_AUTHOR("Alexander Wold " - "and Mark D. Studebaker "); -MODULE_DESCRIPTION("Savage4 I2C/SMBus driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_savage4_init); -module_exit(i2c_savage4_exit); diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 580acc93903..988e566d3ed 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -111,7 +111,6 @@ #define I2C_HW_B_RIVA 0x010010 /* Riva based graphics cards */ #define I2C_HW_B_IOC 0x010011 /* IOC bit-wiggling */ #define I2C_HW_B_IXP2000 0x010016 /* GPIO on IXP2000 systems */ -#define I2C_HW_B_S3VIA 0x010018 /* S3Via ProSavage adapter */ #define I2C_HW_B_ZR36067 0x010019 /* Zoran-36057/36067 based boards */ #define I2C_HW_B_PCILYNX 0x01001a /* TI PCILynx I2C adapter */ #define I2C_HW_B_CX2388x 0x01001b /* connexant 2388x based tv cards */ -- cgit v1.2.3-70-g09d2 From 67c2e66571c383404a5acd08189194da660da942 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:23 +0200 Subject: i2c: Delete unused function i2c_smbus_write_quick Function i2c_smbus_write_quick has no users left, so we can delete it. Also update the list of these helper functions which are gone but could be added back if needed. Signed-off-by: Jean Delvare --- Documentation/i2c/smbus-protocol | 4 ++-- Documentation/i2c/writing-clients | 14 +++++++------- drivers/i2c/i2c-core.c | 7 ------- include/linux/i2c.h | 1 - 4 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol index 03f08fb491c..24bfb65da17 100644 --- a/Documentation/i2c/smbus-protocol +++ b/Documentation/i2c/smbus-protocol @@ -42,8 +42,8 @@ Count (8 bits): A data byte containing the length of a block operation. [..]: Data sent by I2C device, as opposed to data sent by the host adapter. 
-SMBus Quick Command: i2c_smbus_write_quick() -============================================= +SMBus Quick Command +=================== This sends a single bit to the device, at the place of the Rd/Wr bit. diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index ba5d1971f35..63722d3c9cd 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -569,7 +569,6 @@ SMBus communication in terms of it. Never use this function directly! - extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command); @@ -578,20 +577,21 @@ SMBus communication extern s32 i2c_smbus_read_word_data(struct i2c_client * client, u8 command); extern s32 i2c_smbus_write_word_data(struct i2c_client * client, u8 command, u16 value); + extern s32 i2c_smbus_read_block_data(struct i2c_client * client, + u8 command, u8 *values); extern s32 i2c_smbus_write_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); - -These ones were removed in Linux 2.6.10 because they had no users, but could -be added back later if needed: - - extern s32 i2c_smbus_read_block_data(struct i2c_client * client, - u8 command, u8 *values); extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); + +These ones were removed from i2c-core because they had no users, but could +be added back later if needed: + + extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_process_call(struct i2c_client * client, u8 command, u16 value); extern s32 i2c_smbus_block_process_call(struct i2c_client *client, diff --git a/drivers/i2c/i2c-core.c 
b/drivers/i2c/i2c-core.c index 937f1dcbf3d..3695a4a1ab7 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1303,13 +1303,6 @@ static int i2c_smbus_check_pec(u8 cpec, struct i2c_msg *msg) return 0; } -s32 i2c_smbus_write_quick(struct i2c_client *client, u8 value) -{ - return i2c_smbus_xfer(client->adapter,client->addr,client->flags, - value,0,I2C_SMBUS_QUICK,NULL); -} -EXPORT_SYMBOL(i2c_smbus_write_quick); - s32 i2c_smbus_read_byte(struct i2c_client *client) { union i2c_smbus_data data; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 8dc73013219..b3695f353f7 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -71,7 +71,6 @@ extern s32 i2c_smbus_xfer (struct i2c_adapter * adapter, u16 addr, /* Now follow the 'nice' access routines. These also document the calling conventions of smbus_access. */ -extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command); -- cgit v1.2.3-70-g09d2 From ae7193f7fa3e1735ab70807eb6e35a2a6575623f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:24 +0200 Subject: i2c: Update stray references to smbus_access That function is actually named i2c_smbus_xfer. Signed-off-by: Jean Delvare --- include/linux/i2c.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b3695f353f7..7c36d5188d3 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -69,7 +69,7 @@ extern s32 i2c_smbus_xfer (struct i2c_adapter * adapter, u16 addr, union i2c_smbus_data * data); /* Now follow the 'nice' access routines. These also document the calling - conventions of smbus_access. */ + conventions of i2c_smbus_xfer. 
*/ extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); @@ -536,7 +536,7 @@ union i2c_smbus_data { /* and one more for user-space compatibility */ }; -/* smbus_access read or write markers */ +/* i2c_smbus_xfer read or write markers */ #define I2C_SMBUS_READ 1 #define I2C_SMBUS_WRITE 0 -- cgit v1.2.3-70-g09d2 From c1b6b4f2342d073698dfc2547240c35045a1d00e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:28 +0200 Subject: i2c: Let framebuffer drivers set their I2C bus class to DDC Let framebuffer drivers set their I2C bus class to DDC. Once this is done, we will be able to tell the eeprom driver to only probe for EDID EEPROMs on these buses. Signed-off-by: Jean Delvare --- drivers/video/fb_ddc.c | 1 + drivers/video/intelfb/intelfb_i2c.c | 12 +++++++----- drivers/video/matrox/i2c-matroxfb.c | 20 +++++++++++++++----- include/linux/i2c.h | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fb_ddc.c b/drivers/video/fb_ddc.c index a0df63289b5..0cf96eb8a60 100644 --- a/drivers/video/fb_ddc.c +++ b/drivers/video/fb_ddc.c @@ -106,6 +106,7 @@ unsigned char *fb_ddc_read(struct i2c_adapter *adapter) algo_data->setsda(algo_data->data, 1); algo_data->setscl(algo_data->data, 1); + adapter->class |= I2C_CLASS_DDC; return edid; } diff --git a/drivers/video/intelfb/intelfb_i2c.c b/drivers/video/intelfb/intelfb_i2c.c index ca95f09d8b4..fcf9fadbf57 100644 --- a/drivers/video/intelfb/intelfb_i2c.c +++ b/drivers/video/intelfb/intelfb_i2c.c @@ -100,7 +100,8 @@ static int intelfb_gpio_getsda(void *data) static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, struct intelfb_i2c_chan *chan, - const u32 reg, const char *name) + const u32 reg, const char *name, + int class) { int rc; @@ -108,6 +109,7 @@ static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, chan->reg = reg; snprintf(chan->adapter.name, 
sizeof(chan->adapter.name), "intelfb %s", name); + chan->adapter.class = class; chan->adapter.owner = THIS_MODULE; chan->adapter.id = I2C_HW_B_INTELFB; chan->adapter.algo_data = &chan->algo; @@ -145,7 +147,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) /* setup the DDC bus for analog output */ intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, GPIOA, - "CRTDDC_A"); + "CRTDDC_A", I2C_CLASS_DDC); i++; /* need to add the output busses for each device @@ -159,9 +161,9 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) case INTEL_865G: dinfo->output[i].type = INTELFB_OUTPUT_DVO; intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, - GPIOD, "DVODDC_D"); + GPIOD, "DVODDC_D", I2C_CLASS_DDC); intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "DVOI2C_E"); + GPIOE, "DVOI2C_E", 0); i++; break; case INTEL_915G: @@ -174,7 +176,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) /* SDVO ports have a single control bus - 2 devices */ dinfo->output[i].type = INTELFB_OUTPUT_SDVO; intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "SDVOCTRL_E"); + GPIOE, "SDVOCTRL_E", 0); /* TODO: initialize the SDVO */ /* I830SDVOInit(pScrn, i, DVOB); */ i++; diff --git a/drivers/video/matrox/i2c-matroxfb.c b/drivers/video/matrox/i2c-matroxfb.c index 4baab7be58d..75ee5a12e54 100644 --- a/drivers/video/matrox/i2c-matroxfb.c +++ b/drivers/video/matrox/i2c-matroxfb.c @@ -104,7 +104,9 @@ static struct i2c_algo_bit_data matrox_i2c_algo_template = }; static int i2c_bus_reg(struct i2c_bit_adapter* b, struct matrox_fb_info* minfo, - unsigned int data, unsigned int clock, const char* name) { + unsigned int data, unsigned int clock, const char *name, + int class) +{ int err; b->minfo = minfo; @@ -114,6 +116,7 @@ static int i2c_bus_reg(struct i2c_bit_adapter* b, struct matrox_fb_info* minfo, snprintf(b->adapter.name, sizeof(b->adapter.name), name, minfo->fbcon.node); i2c_set_adapdata(&b->adapter, b); + b->adapter.class = class; 
b->adapter.algo_data = &b->bac; b->adapter.dev.parent = &ACCESS_FBINFO(pcidev)->dev; b->bac = matrox_i2c_algo_template; @@ -159,22 +162,29 @@ static void* i2c_matroxfb_probe(struct matrox_fb_info* minfo) { switch (ACCESS_FBINFO(chip)) { case MGA_2064: case MGA_2164: - err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1B_DATA, DDC1B_CLK, "DDC:fb%u #0"); + err = i2c_bus_reg(&m2info->ddc1, minfo, + DDC1B_DATA, DDC1B_CLK, + "DDC:fb%u #0", I2C_CLASS_DDC); break; default: - err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1_DATA, DDC1_CLK, "DDC:fb%u #0"); + err = i2c_bus_reg(&m2info->ddc1, minfo, + DDC1_DATA, DDC1_CLK, + "DDC:fb%u #0", I2C_CLASS_DDC); break; } if (err) goto fail_ddc1; if (ACCESS_FBINFO(devflags.dualhead)) { - err = i2c_bus_reg(&m2info->ddc2, minfo, DDC2_DATA, DDC2_CLK, "DDC:fb%u #1"); + err = i2c_bus_reg(&m2info->ddc2, minfo, + DDC2_DATA, DDC2_CLK, + "DDC:fb%u #1", I2C_CLASS_DDC); if (err == -ENODEV) { printk(KERN_INFO "i2c-matroxfb: VGA->TV plug detected, DDC unavailable.\n"); } else if (err) printk(KERN_INFO "i2c-matroxfb: Could not register secondary output i2c bus. Continuing anyway.\n"); /* Register maven bus even on G450/G550 */ - err = i2c_bus_reg(&m2info->maven, minfo, MAT_DATA, MAT_CLK, "MAVEN:fb%u"); + err = i2c_bus_reg(&m2info->maven, minfo, + MAT_DATA, MAT_CLK, "MAVEN:fb%u", 0); if (err) printk(KERN_INFO "i2c-matroxfb: Could not register Maven i2c bus. Continuing anyway.\n"); } diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7c36d5188d3..145797fe6a3 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -349,7 +349,7 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data) #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_TV_ANALOG (1<<1) /* bttv + friends */ #define I2C_CLASS_TV_DIGITAL (1<<2) /* dvb cards */ -#define I2C_CLASS_DDC (1<<3) /* i2c-matroxfb ? 
*/ +#define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_CAM_ANALOG (1<<4) /* camera with analog CCD */ #define I2C_CLASS_CAM_DIGITAL (1<<5) /* most webcams */ #define I2C_CLASS_SOUND (1<<6) /* sound devices */ -- cgit v1.2.3-70-g09d2 From 3401b2fff38fbb8b73ea6bcc69a8370ae5d2a7a0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:29 +0200 Subject: i2c: Let bus drivers add SPD to their class Let general purpose I2C/SMBus bus drivers add SPD to their class. Once this is done, we will be able to tell the eeprom driver to only probe for SPD EEPROMs and similar on these buses. Note that I took a conservative approach here, adding I2C_CLASS_SPD to many drivers that have no idea whether they can host SPD EEPROMs or not. This is to make sure that the eeprom driver doesn't stop probing buses where SPD EEPROMs or equivalent live. So, bus driver maintainers and users should feel free to remove the SPD class from drivers whose buses never have SPD EEPROMs, or to which they don't want the eeprom driver to bind. Likewise, feel free to add the SPD class to any bus driver I might have missed. 
Signed-off-by: Jean Delvare --- drivers/i2c/busses/i2c-ali1535.c | 2 +- drivers/i2c/busses/i2c-ali1563.c | 2 +- drivers/i2c/busses/i2c-ali15x3.c | 2 +- drivers/i2c/busses/i2c-amd756.c | 2 +- drivers/i2c/busses/i2c-amd8111.c | 2 +- drivers/i2c/busses/i2c-cpm.c | 2 +- drivers/i2c/busses/i2c-elektor.c | 2 +- drivers/i2c/busses/i2c-gpio.c | 2 +- drivers/i2c/busses/i2c-i801.c | 2 +- drivers/i2c/busses/i2c-ibm_iic.c | 4 ++-- drivers/i2c/busses/i2c-iop3xx.c | 2 +- drivers/i2c/busses/i2c-isch.c | 2 +- drivers/i2c/busses/i2c-mpc.c | 2 +- drivers/i2c/busses/i2c-mv64xxx.c | 2 +- drivers/i2c/busses/i2c-nforce2.c | 2 +- drivers/i2c/busses/i2c-ocores.c | 2 +- drivers/i2c/busses/i2c-pasemi.c | 2 +- drivers/i2c/busses/i2c-piix4.c | 2 +- drivers/i2c/busses/i2c-pmcmsp.c | 2 +- drivers/i2c/busses/i2c-s3c2410.c | 2 +- drivers/i2c/busses/i2c-sibyte.c | 4 ++-- drivers/i2c/busses/i2c-sis5595.c | 2 +- drivers/i2c/busses/i2c-sis630.c | 2 +- drivers/i2c/busses/i2c-sis96x.c | 2 +- drivers/i2c/busses/i2c-stub.c | 2 +- drivers/i2c/busses/i2c-via.c | 2 +- drivers/i2c/busses/i2c-viapro.c | 2 +- drivers/i2c/busses/scx200_acb.c | 2 +- include/linux/i2c.h | 1 + 29 files changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c index 704436cdec8..8d1d90ab3a9 100644 --- a/drivers/i2c/busses/i2c-ali1535.c +++ b/drivers/i2c/busses/i2c-ali1535.c @@ -473,7 +473,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali1535_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI1535, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali1563.c b/drivers/i2c/busses/i2c-ali1563.c index da5a382eee9..4b55ae19db8 100644 --- a/drivers/i2c/busses/i2c-ali1563.c +++ b/drivers/i2c/busses/i2c-ali1563.c @@ -382,7 +382,7 @@ static const struct i2c_algorithm ali1563_algorithm = { static struct i2c_adapter 
ali1563_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI1563, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &ali1563_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c index 7b029b147a8..e922c3950fc 100644 --- a/drivers/i2c/busses/i2c-ali15x3.c +++ b/drivers/i2c/busses/i2c-ali15x3.c @@ -471,7 +471,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali15x3_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI15X3, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c index f0baea62067..bd4f6380fab 100644 --- a/drivers/i2c/busses/i2c-amd756.c +++ b/drivers/i2c/busses/i2c-amd756.c @@ -301,7 +301,7 @@ static const struct i2c_algorithm smbus_algorithm = { struct i2c_adapter amd756_smbus = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_AMD756, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd8111.c b/drivers/i2c/busses/i2c-amd8111.c index a4f687915de..0e18fe84601 100644 --- a/drivers/i2c/busses/i2c-amd8111.c +++ b/drivers/i2c/busses/i2c-amd8111.c @@ -383,7 +383,7 @@ static int __devinit amd8111_probe(struct pci_dev *dev, snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "SMBus2 AMD8111 adapter at %04x", smbus->base); smbus->adapter.id = I2C_HW_SMBUS_AMD8111; - smbus->adapter.class = I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index 53af744a91c..8164de1f4d7 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -423,7 +423,7 @@ static const struct i2c_adapter cpm_ops = { .owner = THIS_MODULE, .name = "i2c-cpm", .algo = &cpm_i2c_algo, 
- .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, }; static int __devinit cpm_i2c_setup(struct cpm_i2c *cpm) diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index b7a9977b025..c251cf21a62 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -202,7 +202,7 @@ static struct i2c_algo_pcf_data pcf_isa_data = { static struct i2c_adapter pcf_isa_ops = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .id = I2C_HW_P_ELEK, .algo_data = &pcf_isa_data, .name = "i2c-elektor", diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index 7c1b762aa68..79b455a1f09 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -140,7 +140,7 @@ static int __init i2c_gpio_probe(struct platform_device *pdev) adap->owner = THIS_MODULE; snprintf(adap->name, sizeof(adap->name), "i2c-gpio%d", pdev->id); adap->algo_data = bit_data; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->dev.parent = &pdev->dev; /* diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 213119211e5..9717ffe1292 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -573,7 +573,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter i801_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_I801, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 85dbf34382e..6f7bfdec3c6 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -740,7 +740,7 @@ static int __devinit iic_probe(struct ocp_device *ocp){ strcpy(adap->name, "IBM IIC"); i2c_set_adapdata(adap, dev); adap->id = I2C_HW_OCP; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; 
adap->algo = &iic_algo; adap->client_register = NULL; adap->client_unregister = NULL; @@ -934,7 +934,7 @@ static int __devinit iic_probe(struct of_device *ofdev, strlcpy(adap->name, "IBM IIC", sizeof(adap->name)); i2c_set_adapdata(adap, dev); adap->id = I2C_HW_OCP; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->algo = &iic_algo; adap->timeout = 1; adap->nr = dev->idx; diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index 39884e79759..fc2714ac0c0 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -482,7 +482,7 @@ iop3xx_i2c_probe(struct platform_device *pdev) memcpy(new_adapter->name, pdev->name, strlen(pdev->name)); new_adapter->id = I2C_HW_IOP3XX; new_adapter->owner = THIS_MODULE; - new_adapter->class = I2C_CLASS_HWMON; + new_adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; new_adapter->dev.parent = &pdev->dev; new_adapter->nr = pdev->id; diff --git a/drivers/i2c/busses/i2c-isch.c b/drivers/i2c/busses/i2c-isch.c index c9cd46b2269..8d648911a7f 100644 --- a/drivers/i2c/busses/i2c-isch.c +++ b/drivers/i2c/busses/i2c-isch.c @@ -251,7 +251,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sch_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index a076129de7e..10b9342a36c 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -311,7 +311,7 @@ static struct i2c_adapter mpc_ops = { .name = "MPC adapter", .id = I2C_HW_MPC107, .algo = &mpc_algo, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .timeout = 1, }; diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index 036e6a883e6..9e8118d2fe6 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -530,7 +530,7 @@ 
mv64xxx_i2c_probe(struct platform_device *pd) drv_data->adapter.id = I2C_HW_MV64XXX; drv_data->adapter.algo = &mv64xxx_i2c_algo; drv_data->adapter.owner = THIS_MODULE; - drv_data->adapter.class = I2C_CLASS_HWMON; + drv_data->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; drv_data->adapter.timeout = pdata->timeout; drv_data->adapter.nr = pd->id; platform_set_drvdata(pd, drv_data); diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 081fdf3393f..2654f20d3a6 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -350,7 +350,7 @@ static int __devinit nforce2_probe_smb (struct pci_dev *dev, int bar, } smbus->adapter.owner = THIS_MODULE; smbus->adapter.id = I2C_HW_SMBUS_NFORCE2; - smbus->adapter.class = I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; smbus->adapter.dev.parent = &dev->dev; diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index f145692cbb7..51ca79bf648 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -205,7 +205,7 @@ static const struct i2c_algorithm ocores_algorithm = { static struct i2c_adapter ocores_adapter = { .owner = THIS_MODULE, .name = "i2c-ocores", - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &ocores_algorithm, }; diff --git a/drivers/i2c/busses/i2c-pasemi.c b/drivers/i2c/busses/i2c-pasemi.c index 1603c81e39d..adf0fbb902f 100644 --- a/drivers/i2c/busses/i2c-pasemi.c +++ b/drivers/i2c/busses/i2c-pasemi.c @@ -365,7 +365,7 @@ static int __devinit pasemi_smb_probe(struct pci_dev *dev, smbus->adapter.owner = THIS_MODULE; snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "PA Semi SMBus adapter at 0x%lx", smbus->base); - smbus->adapter.class = I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; 
smbus->adapter.algo_data = smbus; smbus->adapter.nr = PCI_FUNC(dev->devfn); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 2bde47509e1..85d69f3e624 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -402,7 +402,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter piix4_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_PIIX4, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-pmcmsp.c b/drivers/i2c/busses/i2c-pmcmsp.c index 63b3e2c11cf..dcf2045b522 100644 --- a/drivers/i2c/busses/i2c-pmcmsp.c +++ b/drivers/i2c/busses/i2c-pmcmsp.c @@ -622,7 +622,7 @@ static struct i2c_algorithm pmcmsptwi_algo = { static struct i2c_adapter pmcmsptwi_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &pmcmsptwi_algo, .name = DRV_NAME, }; diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index 9e8c875437b..007390ad981 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -590,7 +590,7 @@ static struct s3c24xx_i2c s3c24xx_i2c = { .owner = THIS_MODULE, .algo = &s3c24xx_i2c_algorithm, .retries = 2, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, }, }; diff --git a/drivers/i2c/busses/i2c-sibyte.c b/drivers/i2c/busses/i2c-sibyte.c index 114634da6c6..ac8822e7a5b 100644 --- a/drivers/i2c/busses/i2c-sibyte.c +++ b/drivers/i2c/busses/i2c-sibyte.c @@ -156,7 +156,7 @@ static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, .id = I2C_HW_SIBYTE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[0], .name = "SiByte SMBus 0", @@ -164,7 +164,7 @@ static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, .id = I2C_HW_SIBYTE, - .class = I2C_CLASS_HWMON, + 
.class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[1], .name = "SiByte SMBus 1", diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c index 328441bb547..f76944b384f 100644 --- a/drivers/i2c/busses/i2c-sis5595.c +++ b/drivers/i2c/busses/i2c-sis5595.c @@ -362,7 +362,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis5595_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS5595, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index d7e6ff3e018..eb2b2181fed 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -462,7 +462,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis630_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS630, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis96x.c b/drivers/i2c/busses/i2c-sis96x.c index cde8e588036..413e9e47772 100644 --- a/drivers/i2c/busses/i2c-sis96x.c +++ b/drivers/i2c/busses/i2c-sis96x.c @@ -244,7 +244,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis96x_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS96X, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-stub.c b/drivers/i2c/busses/i2c-stub.c index e37ccd80f77..1b7b2af9403 100644 --- a/drivers/i2c/busses/i2c-stub.c +++ b/drivers/i2c/busses/i2c-stub.c @@ -140,7 +140,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter stub_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, .name = "SMBus stub driver", }; diff --git 
a/drivers/i2c/busses/i2c-via.c b/drivers/i2c/busses/i2c-via.c index 61716f6b14d..6517f8a6d91 100644 --- a/drivers/i2c/busses/i2c-via.c +++ b/drivers/i2c/busses/i2c-via.c @@ -87,7 +87,7 @@ static struct i2c_algo_bit_data bit_data = { static struct i2c_adapter vt586b_adapter = { .owner = THIS_MODULE, .id = I2C_HW_B_VIA, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .name = "VIA i2c", .algo_data = &bit_data, }; diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index c611905df00..7957ce51589 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -311,7 +311,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter vt596_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_VIA2, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/scx200_acb.c b/drivers/i2c/busses/scx200_acb.c index 61abe0f3325..ed794b145a1 100644 --- a/drivers/i2c/busses/scx200_acb.c +++ b/drivers/i2c/busses/scx200_acb.c @@ -442,7 +442,7 @@ static __init struct scx200_acb_iface *scx200_create_iface(const char *text, adapter->owner = THIS_MODULE; adapter->id = I2C_HW_SMBUS_SCX200; adapter->algo = &scx200_acb_algorithm; - adapter->class = I2C_CLASS_HWMON; + adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adapter->dev.parent = dev; mutex_init(&iface->mutex); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 145797fe6a3..839d0ea3dca 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -353,6 +353,7 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data) #define I2C_CLASS_CAM_ANALOG (1<<4) /* camera with analog CCD */ #define I2C_CLASS_CAM_DIGITAL (1<<5) /* most webcams */ #define I2C_CLASS_SOUND (1<<6) /* sound devices */ +#define I2C_CLASS_SPD (1<<7) /* SPD EEPROMs and similar */ #define I2C_CLASS_ALL (UINT_MAX) /* all of the above */ /* i2c_client_address_data is the 
struct for holding default client -- cgit v1.2.3-70-g09d2 From 0573d11b2bbd0e4774f33f4c1959c1939c055e96 Mon Sep 17 00:00:00 2001 From: Eric Brower Date: Mon, 14 Jul 2008 22:38:31 +0200 Subject: i2c-algo-pcf: Multi-master lost-arbitration improvement Improve lost-arbitration handling of PCF8584. This is necessary for support of a currently out-of-kernel driver for Sun Microsystems E250 environmental management; perhaps others. Signed-off-by: Eric Brower Acked-by: Dan Smolik Signed-off-by: Jean Delvare --- drivers/i2c/algos/i2c-algo-pcf.c | 48 ++++++++++++++++++++++++++-------------- include/linux/i2c-algo-pcf.h | 6 +++++ 2 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/algos/i2c-algo-pcf.c b/drivers/i2c/algos/i2c-algo-pcf.c index 8907b019167..1e328d19cd6 100644 --- a/drivers/i2c/algos/i2c-algo-pcf.c +++ b/drivers/i2c/algos/i2c-algo-pcf.c @@ -78,6 +78,36 @@ static void i2c_stop(struct i2c_algo_pcf_data *adap) set_pcf(adap, 1, I2C_PCF_STOP); } +static void handle_lab(struct i2c_algo_pcf_data *adap, const int *status) +{ + DEB2(printk(KERN_INFO + "i2c-algo-pcf.o: lost arbitration (CSR 0x%02x)\n", + *status)); + + /* Cleanup from LAB -- reset and enable ESO. + * This resets the PCF8584; since we've lost the bus, no + * further attempts should be made by callers to clean up + * (no i2c_stop() etc.) + */ + set_pcf(adap, 1, I2C_PCF_PIN); + set_pcf(adap, 1, I2C_PCF_ESO); + + /* We pause for a time period sufficient for any running + * I2C transaction to complete -- the arbitration logic won't + * work properly until the next START is seen. + * It is assumed the bus driver or client has set a proper value. + * + * REVISIT: should probably use msleep instead of mdelay if we + * know we can sleep. 
+ */ + if (adap->lab_mdelay) + mdelay(adap->lab_mdelay); + + DEB2(printk(KERN_INFO + "i2c-algo-pcf.o: reset LAB condition (CSR 0x%02x)\n", + get_pcf(adap, 1))); +} + static int wait_for_bb(struct i2c_algo_pcf_data *adap) { int timeout = DEF_TIMEOUT; @@ -109,23 +139,7 @@ static int wait_for_pin(struct i2c_algo_pcf_data *adap, int *status) { *status = get_pcf(adap, 1); } if (*status & I2C_PCF_LAB) { - DEB2(printk(KERN_INFO - "i2c-algo-pcf.o: lost arbitration (CSR 0x%02x)\n", - *status)); - /* Cleanup from LAB-- reset and enable ESO. - * This resets the PCF8584; since we've lost the bus, no - * further attempts should be made by callers to clean up - * (no i2c_stop() etc.) - */ - set_pcf(adap, 1, I2C_PCF_PIN); - set_pcf(adap, 1, I2C_PCF_ESO); - /* TODO: we should pause for a time period sufficient for any - * running I2C transaction to complete-- the arbitration - * logic won't work properly until the next START is seen. - */ - DEB2(printk(KERN_INFO - "i2c-algo-pcf.o: reset LAB condition (CSR 0x%02x)\n", - get_pcf(adap,1))); + handle_lab(adap, status); return(-EINTR); } #endif diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h index 77afbb60fd1..74fb6f889a7 100644 --- a/include/linux/i2c-algo-pcf.h +++ b/include/linux/i2c-algo-pcf.h @@ -36,6 +36,12 @@ struct i2c_algo_pcf_data { /* local settings */ int udelay; int timeout; + + /* Multi-master lost arbitration back-off delay (msecs) + * This should be set by the bus adapter or knowledgable client + * if bus is multi-mastered, else zero + */ + unsigned long lab_mdelay; }; int i2c_pcf_add_bus(struct i2c_adapter *); -- cgit v1.2.3-70-g09d2 From e3e7fc3c401a5d53f0599a357b3cf65d6a4f52e3 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:31 +0200 Subject: i2c-algo-pcf: Drop unused struct members Struct members udelay and timeout aren't used anywhere, so drop them. 
Signed-off-by: Jean Delvare Acked-by: Eric Brower --- drivers/i2c/busses/i2c-elektor.c | 2 -- include/linux/i2c-algo-pcf.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index c251cf21a62..7f38c01fb3a 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -196,8 +196,6 @@ static struct i2c_algo_pcf_data pcf_isa_data = { .getown = pcf_isa_getown, .getclock = pcf_isa_getclock, .waitforpin = pcf_isa_waitforpin, - .udelay = 10, - .timeout = 100, }; static struct i2c_adapter pcf_isa_ops = { diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h index 74fb6f889a7..0177d280f73 100644 --- a/include/linux/i2c-algo-pcf.h +++ b/include/linux/i2c-algo-pcf.h @@ -33,10 +33,6 @@ struct i2c_algo_pcf_data { int (*getclock) (void *data); void (*waitforpin) (void); - /* local settings */ - int udelay; - int timeout; - /* Multi-master lost arbitration back-off delay (msecs) * This should be set by the bus adapter or knowledgable client * if bus is multi-mastered, else zero -- cgit v1.2.3-70-g09d2 From f6a7110520037ba786f17b53790c6eb8a3d4ef55 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:34 +0200 Subject: i2c-dev: Delete empty detach_client callback Implementing detach_client is optional, so there is no point in an empty implementation. Likewise, i2c driver IDs are optional, and we don't need one. 
Signed-off-by: Jean Delvare --- drivers/i2c/i2c-dev.c | 7 ------- include/linux/i2c-id.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index e96d9869678..50df53640c7 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -548,19 +548,12 @@ static int i2cdev_detach_adapter(struct i2c_adapter *adap) return 0; } -static int i2cdev_detach_client(struct i2c_client *client) -{ - return 0; -} - static struct i2c_driver i2cdev_driver = { .driver = { .name = "dev_driver", }, - .id = I2C_DRIVERID_I2CDEV, .attach_adapter = i2cdev_attach_adapter, .detach_adapter = i2cdev_detach_adapter, - .detach_client = i2cdev_detach_client, }; /* ------------------------------------------------------------------------- */ diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 988e566d3ed..ef13b7c66df 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -91,8 +91,6 @@ #define I2C_DRIVERID_M52790 95 /* Mitsubishi M52790SP/FP AV switch */ #define I2C_DRIVERID_CS5345 96 /* cs5345 audio processor */ -#define I2C_DRIVERID_I2CDEV 900 - #define I2C_DRIVERID_OV7670 1048 /* Omnivision 7670 camera */ /* -- cgit v1.2.3-70-g09d2 From e9ca9eb9d7fc7bf3dc3cec5ba7edb089c4625f7b Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Mon, 14 Jul 2008 22:38:35 +0200 Subject: i2c: Export the i2c_bus_type symbol Export the root of the i2c bus so that PowerPC device tree code can iterate over devices on the i2c bus. 
Signed-off-by: Jon Smirl Signed-off-by: Jean Delvare --- drivers/i2c/i2c-core.c | 3 ++- include/linux/i2c.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index d6cc58abf3f..e45bb2838f4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -201,7 +201,7 @@ static struct device_attribute i2c_dev_attrs[] = { { }, }; -static struct bus_type i2c_bus_type = { +struct bus_type i2c_bus_type = { .name = "i2c", .dev_attrs = i2c_dev_attrs, .match = i2c_device_match, @@ -212,6 +212,7 @@ static struct bus_type i2c_bus_type = { .suspend = i2c_device_suspend, .resume = i2c_device_resume, }; +EXPORT_SYMBOL_GPL(i2c_bus_type); /** diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 839d0ea3dca..50cbab4b62b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -35,6 +35,8 @@ #include /* for completion */ #include +extern struct bus_type i2c_bus_type; + /* --- General options ------------------------------------------------ */ struct i2c_msg; -- cgit v1.2.3-70-g09d2 From 2b7a5056a0a7ff17d5d2004c29c852a92a6bd632 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Jul 2008 22:38:35 +0200 Subject: i2c: New-style EEPROM driver using device IDs Add a new-style driver for most I2C EEPROMs, giving sysfs read/write access to their data. Tested with various chips and clock rates. 
Signed-off-by: Wolfram Sang Signed-off-by: Jean Delvare --- drivers/i2c/chips/Kconfig | 26 ++ drivers/i2c/chips/Makefile | 1 + drivers/i2c/chips/at24.c | 583 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c/at24.h | 28 +++ 4 files changed, 638 insertions(+) create mode 100644 drivers/i2c/chips/at24.c create mode 100644 include/linux/i2c/at24.h (limited to 'include/linux') diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig index 6326468d5f0..50e0a465374 100644 --- a/drivers/i2c/chips/Kconfig +++ b/drivers/i2c/chips/Kconfig @@ -14,6 +14,32 @@ config DS1682 This driver can also be built as a module. If so, the module will be called ds1682. +config AT24 + tristate "EEPROMs from most vendors" + depends on SYSFS && EXPERIMENTAL + help + Enable this driver to get read/write support to most I2C EEPROMs, + after you configure the driver to know about each EEPROM on + your target board. Use these generic chip names, instead of + vendor-specific ones like at24c64 or 24lc02: + + 24c00, 24c01, 24c02, spd (readonly 24c02), 24c04, 24c08, + 24c16, 24c32, 24c64, 24c128, 24c256, 24c512, 24c1024 + + Unless you like data loss puzzles, always be sure that any chip + you configure as a 24c32 (32 kbit) or larger is NOT really a + 24c16 (16 kbit) or smaller, and vice versa. Marking the chip + as read-only won't help recover from this. Also, if your chip + has any software write-protect mechanism you may want to review the + code to make sure this driver won't turn it on by accident. + + If you use this with an SMBus adapter instead of an I2C adapter, + full functionality is not available. Only smaller devices are + supported (24c16 and below, max 4 kByte). + + This driver can also be built as a module. If so, the module + will be called at24. 
+ config SENSORS_EEPROM tristate "EEPROM reader" depends on EXPERIMENTAL diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile index e47aca0ca5a..39e3e69ed12 100644 --- a/drivers/i2c/chips/Makefile +++ b/drivers/i2c/chips/Makefile @@ -10,6 +10,7 @@ # obj-$(CONFIG_DS1682) += ds1682.o +obj-$(CONFIG_AT24) += at24.o obj-$(CONFIG_SENSORS_EEPROM) += eeprom.o obj-$(CONFIG_SENSORS_MAX6875) += max6875.o obj-$(CONFIG_SENSORS_PCA9539) += pca9539.o diff --git a/drivers/i2c/chips/at24.c b/drivers/i2c/chips/at24.c new file mode 100644 index 00000000000..e764c94f3e3 --- /dev/null +++ b/drivers/i2c/chips/at24.c @@ -0,0 +1,583 @@ +/* + * at24.c - handle most I2C EEPROMs + * + * Copyright (C) 2005-2007 David Brownell + * Copyright (C) 2008 Wolfram Sang, Pengutronix + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * I2C EEPROMs from most vendors are inexpensive and mostly interchangeable. + * Differences between different vendor product lines (like Atmel AT24C or + * MicroChip 24LC, etc) won't much matter for typical read/write access. + * There are also I2C RAM chips, likewise interchangeable. One example + * would be the PCF8570, which acts like a 24c02 EEPROM (256 bytes). + * + * However, misconfiguration can lose data. "Set 16-bit memory address" + * to a part with 8-bit addressing will overwrite data. Writing with too + * big a page size also loses data. And it's not safe to assume that the + * conventional addresses 0x50..0x57 only hold eeproms; a PCF8563 RTC + * uses 0x51, for just one example. + * + * Accordingly, explicit board-specific configuration data should be used + * in almost all cases. 
(One partial exception is an SMBus used to access + * "SPD" data for DRAM sticks. Those only use 24c02 EEPROMs.) + * + * So this driver uses "new style" I2C driver binding, expecting to be + * told what devices exist. That may be in arch/X/mach-Y/board-Z.c or + * similar kernel-resident tables; or, configuration data coming from + * a bootloader. + * + * Other than binding model, current differences from "eeprom" driver are + * that this one handles write access and isn't restricted to 24c02 devices. + * It also handles larger devices (32 kbit and up) with two-byte addresses, + * which won't work on pure SMBus systems. + */ + +struct at24_data { + struct at24_platform_data chip; + bool use_smbus; + + /* + * Lock protects against activities from other Linux tasks, + * but not from changes by other I2C masters. + */ + struct mutex lock; + struct bin_attribute bin; + + u8 *writebuf; + unsigned write_max; + unsigned num_addresses; + + /* + * Some chips tie up multiple I2C addresses; dummy devices reserve + * them for us, and we'll use them with SMBus calls. + */ + struct i2c_client *client[]; +}; + +/* + * This parameter is to help this driver avoid blocking other drivers out + * of I2C for potentially troublesome amounts of time. With a 100 kHz I2C + * clock, one 256 byte read takes about 1/43 second which is excessive; + * but the 1/170 second it takes at 400 kHz may be quite reasonable; and + * at 1 MHz (Fm+) a 1/430 second delay could easily be invisible. + * + * This value is forced to be a power of two so that writes align on pages. + */ +static unsigned io_limit = 128; +module_param(io_limit, uint, 0); +MODULE_PARM_DESC(io_limit, "Maximum bytes per I/O (default 128)"); + +/* + * Specs often allow 5 msec for a page write, sometimes 20 msec; + * it's important to recover from write timeouts. 
+ */ +static unsigned write_timeout = 25; +module_param(write_timeout, uint, 0); +MODULE_PARM_DESC(write_timeout, "Time (in ms) to try writes (default 25)"); + +#define AT24_SIZE_BYTELEN 5 +#define AT24_SIZE_FLAGS 8 + +#define AT24_BITMASK(x) (BIT(x) - 1) + +/* create non-zero magic value for given eeprom parameters */ +#define AT24_DEVICE_MAGIC(_len, _flags) \ + ((1 << AT24_SIZE_FLAGS | (_flags)) \ + << AT24_SIZE_BYTELEN | ilog2(_len)) + +static const struct i2c_device_id at24_ids[] = { + /* needs 8 addresses as A0-A2 are ignored */ + { "24c00", AT24_DEVICE_MAGIC(128 / 8, AT24_FLAG_TAKE8ADDR) }, + /* old variants can't be handled with this generic entry! */ + { "24c01", AT24_DEVICE_MAGIC(1024 / 8, 0) }, + { "24c02", AT24_DEVICE_MAGIC(2048 / 8, 0) }, + /* spd is a 24c02 in memory DIMMs */ + { "spd", AT24_DEVICE_MAGIC(2048 / 8, + AT24_FLAG_READONLY | AT24_FLAG_IRUGO) }, + { "24c04", AT24_DEVICE_MAGIC(4096 / 8, 0) }, + /* 24rf08 quirk is handled at i2c-core */ + { "24c08", AT24_DEVICE_MAGIC(8192 / 8, 0) }, + { "24c16", AT24_DEVICE_MAGIC(16384 / 8, 0) }, + { "24c32", AT24_DEVICE_MAGIC(32768 / 8, AT24_FLAG_ADDR16) }, + { "24c64", AT24_DEVICE_MAGIC(65536 / 8, AT24_FLAG_ADDR16) }, + { "24c128", AT24_DEVICE_MAGIC(131072 / 8, AT24_FLAG_ADDR16) }, + { "24c256", AT24_DEVICE_MAGIC(262144 / 8, AT24_FLAG_ADDR16) }, + { "24c512", AT24_DEVICE_MAGIC(524288 / 8, AT24_FLAG_ADDR16) }, + { "24c1024", AT24_DEVICE_MAGIC(1048576 / 8, AT24_FLAG_ADDR16) }, + { "at24", 0 }, + { /* END OF LIST */ } +}; +MODULE_DEVICE_TABLE(i2c, at24_ids); + +/*-------------------------------------------------------------------------*/ + +/* + * This routine supports chips which consume multiple I2C addresses. It + * computes the addressing information to be used for a given r/w request. + * Assumes that sanity checks for offset happened at sysfs-layer. 
+ */
+static struct i2c_client *at24_translate_offset(struct at24_data *at24,
+		unsigned *offset)
+{
+	unsigned i;
+
+	/*
+	 * One I2C address covers 64 KiB with 16-bit addressing and 256 bytes
+	 * otherwise; the remaining upper offset bits pick the client and
+	 * *offset is reduced to the offset within that client.
+	 */
+	if (at24->chip.flags & AT24_FLAG_ADDR16) {
+		i = *offset >> 16;
+		*offset &= 0xffff;
+	} else {
+		i = *offset >> 8;
+		*offset &= 0xff;
+	}
+
+	return at24->client[i];
+}
+
+/*
+ * Read at most @count bytes (capped by io_limit and, on SMBus, by
+ * I2C_SMBUS_BLOCK_MAX) starting at @offset into @buf, in one transaction.
+ *
+ * Returns the number of bytes read, or a negative errno. Callers loop
+ * until everything they need has been read; see at24_bin_read().
+ */
+static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf,
+		unsigned offset, size_t count)
+{
+	struct i2c_msg msg[2];
+	u8 msgbuf[2];
+	struct i2c_client *client;
+	int status, i;
+
+	memset(msg, 0, sizeof(msg));
+
+	/*
+	 * REVISIT some multi-address chips don't rollover page reads to
+	 * the next slave address, so we may need to truncate the count.
+	 * Those chips might need another quirk flag.
+	 *
+	 * If the real hardware used four adjacent 24c02 chips and that
+	 * were misconfigured as one 24c08, that would be a similar effect:
+	 * one "eeprom" file not four, but larger reads would fail when
+	 * they crossed certain pages.
+	 */
+
+	/*
+	 * Slave address and byte offset derive from the offset. Always
+	 * set the byte address; on a multi-master board, another master
+	 * may have changed the chip's "current" address pointer.
+	 */
+	client = at24_translate_offset(at24, &offset);
+
+	if (count > io_limit)
+		count = io_limit;
+
+	/* Smaller eeproms can work given some SMBus extension calls */
+	if (at24->use_smbus) {
+		if (count > I2C_SMBUS_BLOCK_MAX)
+			count = I2C_SMBUS_BLOCK_MAX;
+		status = i2c_smbus_read_i2c_block_data(client, offset,
+				count, buf);
+		/* count is size_t and offset is unsigned: print them as such */
+		dev_dbg(&client->dev, "smbus read %zu@%u --> %d\n",
+				count, offset, status);
+		return (status < 0) ? -EIO : status;
+	}
+
+	/*
+	 * When we have a better choice than SMBus calls, use a combined
+	 * I2C message. Write address; then read up to io_limit data bytes.
+	 * Note that read page rollover helps us here (unlike writes).
+	 * msgbuf is u8 and will cast to our needs.
+	 */
+	i = 0;
+	if (at24->chip.flags & AT24_FLAG_ADDR16)
+		msgbuf[i++] = offset >> 8;
+	msgbuf[i++] = offset;
+
+	msg[0].addr = client->addr;
+	msg[0].buf = msgbuf;
+	msg[0].len = i;
+
+	msg[1].addr = client->addr;
+	msg[1].flags = I2C_M_RD;
+	msg[1].buf = buf;
+	msg[1].len = count;
+
+	status = i2c_transfer(client->adapter, msg, 2);
+	dev_dbg(&client->dev, "i2c read %zu@%u --> %d\n",
+			count, offset, status);
+
+	/* i2c_transfer() reports how many messages completed; we need both */
+	if (status == 2)
+		return count;
+	else if (status >= 0)
+		return -EIO;
+	else
+		return status;
+}
+
+/*
+ * sysfs read handler of the "eeprom" attribute: reads @count bytes at @off
+ * chunk by chunk under at24->lock. Returns the number of bytes read; an
+ * error is only returned when nothing at all could be read.
+ */
+static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr,
+		char *buf, loff_t off, size_t count)
+{
+	struct at24_data *at24;
+	ssize_t retval = 0;
+
+	at24 = dev_get_drvdata(container_of(kobj, struct device, kobj));
+
+	if (unlikely(!count))
+		return count;
+
+	/*
+	 * Read data from chip, protecting against concurrent updates
+	 * from this host, but not from other I2C masters.
+	 */
+	mutex_lock(&at24->lock);
+
+	while (count) {
+		ssize_t status;
+
+		status = at24_eeprom_read(at24, buf, off, count);
+		if (status <= 0) {
+			/* keep a partial result; report the error otherwise */
+			if (retval == 0)
+				retval = status;
+			break;
+		}
+		buf += status;
+		off += status;
+		count -= status;
+		retval += status;
+	}
+
+	mutex_unlock(&at24->lock);
+
+	return retval;
+}
+
+
+/*
+ * REVISIT: export at24_bin{read,write}() to let other kernel code use
+ * eeprom data. For example, it might hold a board's Ethernet address, or
+ * board-specific calibration data generated on the manufacturing floor.
+ */
+
+
+/*
+ * Note that if the hardware write-protect pin is pulled high, the whole
+ * chip is normally write protected. But there are plenty of product
+ * variants here, including OTP fuses and partial chip protect.
+ *
+ * We only use page mode writes; the alternative is sloooow. This routine
+ * writes at most one page.
+ */
+static ssize_t at24_eeprom_write(struct at24_data *at24, char *buf,
+		unsigned offset, size_t count)
+{
+	struct i2c_client *client;
+	struct i2c_msg msg;
+	ssize_t status;
+	unsigned long timeout, write_time;
+	unsigned next_page;
+
+	/* Get corresponding I2C address and adjust offset */
+	client = at24_translate_offset(at24, &offset);
+
+	/* write_max is at most a page */
+	if (count > at24->write_max)
+		count = at24->write_max;
+
+	/* Never roll over backwards, to the start of this page */
+	next_page = roundup(offset + 1, at24->chip.page_size);
+	if (offset + count > next_page)
+		count = next_page - offset;
+
+	/* If we'll use I2C calls for I/O, set up the message */
+	if (!at24->use_smbus) {
+		int i = 0;
+
+		msg.addr = client->addr;
+		msg.flags = 0;
+
+		/* msg.buf is u8 and casts will mask the values */
+		msg.buf = at24->writebuf;
+		if (at24->chip.flags & AT24_FLAG_ADDR16)
+			msg.buf[i++] = offset >> 8;
+
+		msg.buf[i++] = offset;
+		memcpy(&msg.buf[i], buf, count);
+		msg.len = i + count;
+	}
+
+	/*
+	 * Writes fail if the previous one didn't complete yet. We may
+	 * loop a few times until this one succeeds, waiting at least
+	 * long enough for one entire page write to work.
+	 */
+	timeout = jiffies + msecs_to_jiffies(write_timeout);
+	do {
+		write_time = jiffies;
+		if (at24->use_smbus) {
+			status = i2c_smbus_write_i2c_block_data(client,
+					offset, count, buf);
+			if (status == 0)
+				status = count;
+		} else {
+			status = i2c_transfer(client->adapter, &msg, 1);
+			if (status == 1)
+				status = count;
+		}
+		/* count/offset/jiffies are unsigned: print them as such */
+		dev_dbg(&client->dev, "write %zu@%u --> %zd (%lu)\n",
+				count, offset, status, jiffies);
+
+		if (status == count)
+			return count;
+
+		/* REVISIT: at HZ=100, this is sloooow */
+		msleep(1);
+	} while (time_before(write_time, timeout));
+
+	return -ETIMEDOUT;
+}
+
+/*
+ * sysfs write handler of the "eeprom" attribute: writes @count bytes at
+ * @off page by page under at24->lock. Returns the number of bytes written;
+ * an error is only returned when nothing at all could be written.
+ */
+static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr,
+		char *buf, loff_t off, size_t count)
+{
+	struct at24_data *at24;
+	ssize_t retval = 0;
+
+	at24 = dev_get_drvdata(container_of(kobj, struct device, kobj));
+
+	if (unlikely(!count))
+		return count;
+
+	/*
+	 * Write data to chip, protecting against concurrent updates
+	 * from this host, but not from other I2C masters.
+	 */
+	mutex_lock(&at24->lock);
+
+	while (count) {
+		ssize_t status;
+
+		status = at24_eeprom_write(at24, buf, off, count);
+		if (status <= 0) {
+			/* keep a partial result; report the error otherwise */
+			if (retval == 0)
+				retval = status;
+			break;
+		}
+		buf += status;
+		off += status;
+		count -= status;
+		retval += status;
+	}
+
+	mutex_unlock(&at24->lock);
+
+	return retval;
+}
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Bind to one EEPROM: take the chip parameters from platform data or from
+ * the matched id, pick I2C or SMBus access, reserve the extra addresses of
+ * multi-address chips and create the "eeprom" sysfs file.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id)
+{
+	struct at24_platform_data chip;
+	bool writable;
+	bool use_smbus = false;
+	struct at24_data *at24;
+	int err;
+	unsigned i, num_addresses;
+	kernel_ulong_t magic;
+
+	if (client->dev.platform_data) {
+		chip = *(struct at24_platform_data *)client->dev.platform_data;
+	} else {
+		if (!id->driver_data) {
+			err = -ENODEV;
+			goto err_out;
+		}
+		magic = id->driver_data;
+		chip.byte_len = BIT(magic & AT24_BITMASK(AT24_SIZE_BYTELEN));
+		magic >>= AT24_SIZE_BYTELEN;
+		chip.flags = magic & AT24_BITMASK(AT24_SIZE_FLAGS);
+		/*
+		 * This is slow, but we can't know all eeproms, so we better
+		 * play safe. Specifying custom eeprom-types via platform_data
+		 * is recommended anyhow.
+		 */
+		chip.page_size = 1;
+	}
+
+	/*
+	 * Zero sizes cannot work: page_size is a divisor in the write path,
+	 * and a zero byte_len would leave no room for client[0] below.
+	 */
+	if (!chip.byte_len || !chip.page_size) {
+		dev_err(&client->dev,
+			"byte_len and page_size must not be 0!\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!is_power_of_2(chip.byte_len))
+		dev_warn(&client->dev,
+			"byte_len looks suspicious (no power of 2)!\n");
+	if (!is_power_of_2(chip.page_size))
+		dev_warn(&client->dev,
+			"page_size looks suspicious (no power of 2)!\n");
+
+	/* Use I2C operations unless we're stuck with SMBus extensions. */
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+		if (chip.flags & AT24_FLAG_ADDR16) {
+			err = -EPFNOSUPPORT;
+			goto err_out;
+		}
+		if (!i2c_check_functionality(client->adapter,
+				I2C_FUNC_SMBUS_READ_I2C_BLOCK)) {
+			err = -EPFNOSUPPORT;
+			goto err_out;
+		}
+		use_smbus = true;
+	}
+
+	if (chip.flags & AT24_FLAG_TAKE8ADDR)
+		num_addresses = 8;
+	else
+		num_addresses =	DIV_ROUND_UP(chip.byte_len,
+			(chip.flags & AT24_FLAG_ADDR16) ? 65536 : 256);
+
+	at24 = kzalloc(sizeof(struct at24_data) +
+		num_addresses * sizeof(struct i2c_client *), GFP_KERNEL);
+	if (!at24) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	mutex_init(&at24->lock);
+	at24->use_smbus = use_smbus;
+	at24->chip = chip;
+	at24->num_addresses = num_addresses;
+
+	/*
+	 * Export the EEPROM bytes through sysfs, since that's convenient.
+	 * By default, only root should see the data (maybe passwords etc)
+	 */
+	at24->bin.attr.name = "eeprom";
+	at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR;
+	at24->bin.attr.owner = THIS_MODULE;
+	at24->bin.read = at24_bin_read;
+	at24->bin.size = chip.byte_len;
+
+	writable = !(chip.flags & AT24_FLAG_READONLY);
+	if (writable) {
+		if (!use_smbus || i2c_check_functionality(client->adapter,
+				I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)) {
+
+			unsigned write_max = chip.page_size;
+
+			at24->bin.write = at24_bin_write;
+			at24->bin.attr.mode |= S_IWUSR;
+
+			if (write_max > io_limit)
+				write_max = io_limit;
+			if (use_smbus && write_max > I2C_SMBUS_BLOCK_MAX)
+				write_max = I2C_SMBUS_BLOCK_MAX;
+			at24->write_max = write_max;
+
+			/* buffer (data + address at the beginning) */
+			at24->writebuf = kmalloc(write_max + 2, GFP_KERNEL);
+			if (!at24->writebuf) {
+				err = -ENOMEM;
+				goto err_struct;
+			}
+		} else {
+			dev_warn(&client->dev,
+				"cannot write due to controller restrictions.\n");
+			/* no write handler was installed: report read-only */
+			writable = false;
+		}
+	}
+
+	at24->client[0] = client;
+
+	/* use dummy devices for multiple-address chips */
+	for (i = 1; i < num_addresses; i++) {
+		at24->client[i] = i2c_new_dummy(client->adapter,
+					client->addr + i);
+		if (!at24->client[i]) {
+			dev_err(&client->dev, "address 0x%02x unavailable\n",
+					client->addr + i);
+			err = -EADDRINUSE;
+			goto err_clients;
+		}
+	}
+
+	/*
+	 * The sysfs handlers look the state up through drvdata, so it has
+	 * to be in place before the file becomes visible to userspace.
+	 */
+	i2c_set_clientdata(client, at24);
+
+	err = sysfs_create_bin_file(&client->dev.kobj, &at24->bin);
+	if (err)
+		goto err_clients;
+
+	dev_info(&client->dev, "%Zd byte %s EEPROM %s\n",
+		at24->bin.size, client->name,
+		writable ? "(writable)" : "(read-only)");
+	dev_dbg(&client->dev,
+		"page_size %d, num_addresses %u, write_max %u%s\n",
+		chip.page_size, num_addresses,
+		at24->write_max,
+		use_smbus ? ", use_smbus" : "");
+
+	return 0;
+
+err_clients:
+	/* don't leave a pointer to the state we are about to free */
+	i2c_set_clientdata(client, NULL);
+
+	/* kzalloc() zeroed client[], so unset slots are NULL */
+	for (i = 1; i < num_addresses; i++)
+		if (at24->client[i])
+			i2c_unregister_device(at24->client[i]);
+
+	kfree(at24->writebuf);
+err_struct:
+	kfree(at24);
+err_out:
+	dev_dbg(&client->dev, "probe error %d\n", err);
+	return err;
+}
+
+/*
+ * Undo at24_probe(): remove the sysfs file first so no handler can run,
+ * then release the dummy clients and the driver state.
+ */
+static int __devexit at24_remove(struct i2c_client *client)
+{
+	struct at24_data *at24;
+	unsigned i;
+
+	at24 = i2c_get_clientdata(client);
+	sysfs_remove_bin_file(&client->dev.kobj, &at24->bin);
+
+	for (i = 1; i < at24->num_addresses; i++)
+		i2c_unregister_device(at24->client[i]);
+
+	kfree(at24->writebuf);
+	kfree(at24);
+	i2c_set_clientdata(client, NULL);
+	return 0;
+}
+
+/*-------------------------------------------------------------------------*/
+
+static struct i2c_driver at24_driver = {
+	.driver = {
+		.name = "at24",
+		.owner = THIS_MODULE,
+	},
+	.probe = at24_probe,
+	.remove = __devexit_p(at24_remove),
+	.id_table = at24_ids,
+};
+
+/* Validate and normalize the io_limit parameter, then register the driver. */
+static int __init at24_init(void)
+{
+	/* rounddown_pow_of_two() has no defined result for 0 */
+	if (!io_limit) {
+		printk(KERN_ERR "at24: io_limit must not be 0!\n");
+		return -EINVAL;
+	}
+
+	io_limit = rounddown_pow_of_two(io_limit);
+	return i2c_add_driver(&at24_driver);
+}
+module_init(at24_init);
+
+static void __exit at24_exit(void)
+{
+	i2c_del_driver(&at24_driver);
+}
+module_exit(at24_exit);
+
+MODULE_DESCRIPTION("Driver for most I2C EEPROMs");
+MODULE_AUTHOR("David Brownell and Wolfram Sang");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c/at24.h b/include/linux/i2c/at24.h
new file mode 100644
index 00000000000..f6edd522a92
--- /dev/null
+++ b/include/linux/i2c/at24.h
@@ -0,0 +1,28 @@
+#ifndef _LINUX_AT24_H
+#define _LINUX_AT24_H
+
+#include <linux/types.h>
+
+/*
+ * As seen through Linux I2C, differences between the most common types of I2C
+ * memory include:
+ * - How much memory is available (usually specified in bit)?
+ * - What write page size does it support?
+ * - Special flags (16 bit addresses, read_only, world readable...)?
+ *
+ * If you set up a custom eeprom type, please double-check the parameters.
+ * Especially page_size needs extra care, as you risk data loss if your value
+ * is bigger than what the chip actually supports!
+ */
+
+struct at24_platform_data {
+	u32		byte_len;		/* size (sum of all addr) */
+	u16		page_size;		/* for writes */
+	u8		flags;			/* bitwise OR of AT24_FLAG_* below */
+#define AT24_FLAG_ADDR16	0x80	/* address pointer is 16 bit */
+#define AT24_FLAG_READONLY	0x40	/* sysfs-entry will be read-only */
+#define AT24_FLAG_IRUGO		0x20	/* sysfs-entry will be world-readable */
+#define AT24_FLAG_TAKE8ADDR	0x10	/* take always 8 addresses (24c00) */
+};
+
+#endif /* _LINUX_AT24_H */
-- cgit v1.2.3-70-g09d2 From 4735c98f8447acb1c8977e2b8024640f7bf36dd6 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:36 +0200 Subject: i2c: Add detection capability to new-style drivers Add a mechanism to let new-style i2c drivers optionally autodetect devices they would support on selected buses and ask i2c-core to instantiate them. This is a replacement for legacy i2c drivers, much cleaner. Where drivers had to implement both a legacy i2c_driver and a new-style i2c_driver so far, this mechanism makes it possible to get rid of the legacy i2c_driver and implement both enumerated and detected device support with just one (new-style) i2c_driver. Here is a quick conversion guide for these drivers, step by step: * Delete the legacy driver definition, registration and removal. Delete the attach_adapter and detach_client methods of the legacy driver. * Change the prototype of the legacy detect function from static int foo_detect(struct i2c_adapter *adapter, int address, int kind); to static int foo_detect(struct i2c_client *client, int kind, struct i2c_board_info *info); * Set the new-style driver detect callback to this new function, and set its address_data to &addr_data (addr_data is generally provided by I2C_CLIENT_INSMOD.) * Add the appropriate class to the new-style driver. This is typically the class the legacy attach_adapter method was checking for.
Class checking is now mandatory (done by i2c-core.) See <linux/i2c.h> for the list of available classes. * Remove the i2c_client allocation and freeing from the detect function. A pre-allocated client is now handed to you by i2c-core, and is freed automatically. * Make the detect function fill the type field of the i2c_board_info structure it was passed as a parameter, and return 0, on success. If the detection fails, return -ENODEV. Signed-off-by: Jean Delvare --- Documentation/i2c/writing-clients | 29 +++++ drivers/i2c/i2c-core.c | 223 ++++++++++++++++++++++++++++++++++++-- include/linux/i2c.h | 36 +++++- 3 files changed, 272 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 63722d3c9cd..6b61b3a2e90 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -44,6 +44,10 @@ static struct i2c_driver foo_driver = { .id_table = foo_ids, .probe = foo_probe, .remove = foo_remove, + /* if device autodetection is needed: */ + .class = I2C_CLASS_SOMETHING, + .detect = foo_detect, + .address_data = &addr_data, /* else, driver uses "legacy" binding model: */ .attach_adapter = foo_attach_adapter, @@ -217,6 +221,31 @@ in the I2C bus driver. You may want to save the returned i2c_client reference for later use. +Device Detection (Standard driver model) +---------------------------------------- + +Sometimes you do not know in advance which I2C devices are connected to +a given I2C bus. This is for example the case of hardware monitoring +devices on a PC's SMBus. In that case, you may want to let your driver +detect supported devices automatically. This is how the legacy model +was working, and is now available as an extension to the standard +driver model (so that we can finally get rid of the legacy model.) 
+ +You simply have to define a detect callback which will attempt to +identify supported devices (returning 0 for supported ones and -ENODEV +for unsupported ones), a list of addresses to probe, and a device type +(or class) so that only I2C buses which may have that type of device +connected (and not otherwise enumerated) will be probed. The i2c +core will then call you back as needed and will instantiate a device +for you for every successful detection. + +Note that this mechanism is purely optional and not suitable for all +devices. You need some reliable way to identify the supported devices +(typically using device-specific, dedicated identification registers), +otherwise misdetections are likely to occur and things can get wrong +quickly. + + Device Deletion (Standard driver model) --------------------------------------- diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 5e249d75882..0a79f766101 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -42,7 +42,9 @@ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); -#define is_newstyle_driver(d) ((d)->probe || (d)->remove) +#define is_newstyle_driver(d) ((d)->probe || (d)->remove || (d)->detect) + +static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); /* ------------------------------------------------------------------------- */ @@ -418,6 +420,10 @@ static int i2c_do_add_adapter(struct device_driver *d, void *data) struct i2c_driver *driver = to_i2c_driver(d); struct i2c_adapter *adap = data; + /* Detect supported devices on that bus, and instantiate them */ + i2c_detect(adap, driver); + + /* Let legacy drivers scan this bus for matching devices */ if (driver->attach_adapter) { /* We ignore the return code; if it fails, too bad */ driver->attach_adapter(adap); @@ -457,7 +463,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); - /* let legacy drivers 
scan this bus for matching devices */ + /* Notify drivers */ dummy = bus_for_each_drv(&i2c_bus_type, NULL, adap, i2c_do_add_adapter); @@ -563,8 +569,19 @@ static int i2c_do_del_adapter(struct device_driver *d, void *data) { struct i2c_driver *driver = to_i2c_driver(d); struct i2c_adapter *adapter = data; + struct i2c_client *client, *_n; int res; + /* Remove the devices we created ourselves */ + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + if (client->adapter == adapter) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + } + if (!driver->detach_adapter) return 0; res = driver->detach_adapter(adapter); @@ -651,7 +668,11 @@ static int __attach_adapter(struct device *dev, void *data) struct i2c_adapter *adapter = to_i2c_adapter(dev); struct i2c_driver *driver = data; - driver->attach_adapter(adapter); + i2c_detect(adapter, driver); + + /* Legacy drivers scan i2c busses directly */ + if (driver->attach_adapter) + driver->attach_adapter(adapter); return 0; } @@ -695,10 +716,9 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) pr_debug("i2c-core: driver [%s] registered\n", driver->driver.name); - /* legacy drivers scan i2c busses directly */ - if (driver->attach_adapter) - class_for_each_device(&i2c_adapter_class, driver, - __attach_adapter); + INIT_LIST_HEAD(&driver->clients); + /* Walk the adapters that are already present */ + class_for_each_device(&i2c_adapter_class, driver, __attach_adapter); mutex_unlock(&core_lock); return 0; @@ -709,6 +729,17 @@ static int __detach_adapter(struct device *dev, void *data) { struct i2c_adapter *adapter = to_i2c_adapter(dev); struct i2c_driver *driver = data; + struct i2c_client *client, *_n; + + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + 
i2c_unregister_device(client); + } + + if (is_newstyle_driver(driver)) + return 0; /* Have a look at each adapter, if clients of this driver are still * attached. If so, detach them to be able to kill the driver @@ -747,10 +778,7 @@ void i2c_del_driver(struct i2c_driver *driver) { mutex_lock(&core_lock); - /* legacy driver? */ - if (!is_newstyle_driver(driver)) - class_for_each_device(&i2c_adapter_class, driver, - __detach_adapter); + class_for_each_device(&i2c_adapter_class, driver, __detach_adapter); driver_unregister(&driver->driver); pr_debug("i2c-core: driver [%s] unregistered\n", driver->driver.name); @@ -1205,6 +1233,179 @@ int i2c_probe(struct i2c_adapter *adapter, } EXPORT_SYMBOL(i2c_probe); +/* Separate detection function for new-style drivers */ +static int i2c_detect_address(struct i2c_client *temp_client, int kind, + struct i2c_driver *driver) +{ + struct i2c_board_info info; + struct i2c_adapter *adapter = temp_client->adapter; + int addr = temp_client->addr; + int err; + + /* Make sure the address is valid */ + if (addr < 0x03 || addr > 0x77) { + dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", + addr); + return -EINVAL; + } + + /* Skip if already in use */ + if (i2c_check_addr(adapter, addr)) + return 0; + + /* Make sure there is something at this address, unless forced */ + if (kind < 0) { + if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, + I2C_SMBUS_QUICK, NULL) < 0) + return 0; + + /* prevent 24RF08 corruption */ + if ((addr & ~0x0f) == 0x50) + i2c_smbus_xfer(adapter, addr, 0, 0, 0, + I2C_SMBUS_QUICK, NULL); + } + + /* Finally call the custom detection function */ + memset(&info, 0, sizeof(struct i2c_board_info)); + info.addr = addr; + err = driver->detect(temp_client, kind, &info); + if (err) { + /* -ENODEV is returned if the detection fails. We catch it + here as this isn't an error. */ + return err == -ENODEV ? 
0 : err; + } + + /* Consistency check */ + if (info.type[0] == '\0') { + dev_err(&adapter->dev, "%s detection function provided " + "no name for 0x%x\n", driver->driver.name, + addr); + } else { + struct i2c_client *client; + + /* Detection succeeded, instantiate the device */ + dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", + info.type, info.addr); + client = i2c_new_device(adapter, &info); + if (client) + list_add_tail(&client->detected, &driver->clients); + else + dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", + info.type, info.addr); + } + return 0; +} + +static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) +{ + const struct i2c_client_address_data *address_data; + struct i2c_client *temp_client; + int i, err = 0; + int adap_id = i2c_adapter_id(adapter); + + address_data = driver->address_data; + if (!driver->detect || !address_data) + return 0; + + /* Set up a temporary client to help detect callback */ + temp_client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL); + if (!temp_client) + return -ENOMEM; + temp_client->adapter = adapter; + + /* Force entries are done first, and are not affected by ignore + entries */ + if (address_data->forces) { + const unsigned short * const *forces = address_data->forces; + int kind; + + for (kind = 0; forces[kind]; kind++) { + for (i = 0; forces[kind][i] != I2C_CLIENT_END; + i += 2) { + if (forces[kind][i] == adap_id + || forces[kind][i] == ANY_I2C_BUS) { + dev_dbg(&adapter->dev, "found force " + "parameter for adapter %d, " + "addr 0x%02x, kind %d\n", + adap_id, forces[kind][i + 1], + kind); + temp_client->addr = forces[kind][i + 1]; + err = i2c_detect_address(temp_client, + kind, driver); + if (err) + goto exit_free; + } + } + } + } + + /* Stop here if we can't use SMBUS_QUICK */ + if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_QUICK)) { + if (address_data->probe[0] == I2C_CLIENT_END + && address_data->normal_i2c[0] == I2C_CLIENT_END) + goto exit_free; + + 
dev_warn(&adapter->dev, "SMBus Quick command not supported, " + "can't probe for chips\n"); + err = -EOPNOTSUPP; + goto exit_free; + } + + /* Stop here if the classes do not match */ + if (!(adapter->class & driver->class)) + goto exit_free; + + /* Probe entries are done second, and are not affected by ignore + entries either */ + for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) { + if (address_data->probe[i] == adap_id + || address_data->probe[i] == ANY_I2C_BUS) { + dev_dbg(&adapter->dev, "found probe parameter for " + "adapter %d, addr 0x%02x\n", adap_id, + address_data->probe[i + 1]); + temp_client->addr = address_data->probe[i + 1]; + err = i2c_detect_address(temp_client, -1, driver); + if (err) + goto exit_free; + } + } + + /* Normal entries are done last, unless shadowed by an ignore entry */ + for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) { + int j, ignore; + + ignore = 0; + for (j = 0; address_data->ignore[j] != I2C_CLIENT_END; + j += 2) { + if ((address_data->ignore[j] == adap_id || + address_data->ignore[j] == ANY_I2C_BUS) + && address_data->ignore[j + 1] + == address_data->normal_i2c[i]) { + dev_dbg(&adapter->dev, "found ignore " + "parameter for adapter %d, " + "addr 0x%02x\n", adap_id, + address_data->ignore[j + 1]); + ignore = 1; + break; + } + } + if (ignore) + continue; + + dev_dbg(&adapter->dev, "found normal entry for adapter %d, " + "addr 0x%02x\n", adap_id, + address_data->normal_i2c[i]); + temp_client->addr = address_data->normal_i2c[i]; + err = i2c_detect_address(temp_client, -1, driver); + if (err) + goto exit_free; + } + + exit_free: + kfree(temp_client); + return err; +} + struct i2c_client * i2c_new_probed_device(struct i2c_adapter *adap, struct i2c_board_info *info, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 50cbab4b62b..08be0d21864 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -45,6 +45,7 @@ struct i2c_adapter; struct i2c_client; struct i2c_driver; union i2c_smbus_data; 
+struct i2c_board_info; /* * The master routines are the ones normally used to transmit data to devices @@ -94,15 +95,33 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, u8 command, u8 length, const u8 *values); -/* - * A driver is capable of handling one or more physical devices present on - * I2C adapters. This information is used to inform the driver of adapter - * events. +/** + * struct i2c_driver - represent an I2C device driver + * @class: What kind of i2c device we instantiate (for detect) + * @detect: Callback for device detection + * @address_data: The I2C addresses to probe, ignore or force (for detect) + * @clients: List of detected clients we created (for i2c-core use only) * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. + * + * For automatic device detection, both @detect and @address_data must + * be defined. @class should also be set, otherwise only devices forced + * with module parameters will be created. The detect function must + * fill at least the name field of the i2c_board_info structure it is + * handed upon successful detection, and possibly also the flags field. + * + * If @detect is missing, the driver will still work fine for enumerated + * devices. Detected devices simply won't be supported. This is expected + * for the many I2C/SMBus devices which can't be detected reliably, and + * the ones which can always be enumerated in practice. + * + * The i2c_client structure which is handed to the @detect callback is + * not a real i2c_client. It is initialized just enough so that you can + * call i2c_smbus_read_byte_data and friends on it. Don't do anything + * else with it. In particular, calling dev_dbg and friends on it is + * not allowed. 
*/ - struct i2c_driver { int id; unsigned int class; @@ -142,6 +161,11 @@ struct i2c_driver { struct device_driver driver; const struct i2c_device_id *id_table; + + /* Device detection callback for automatic device creation */ + int (*detect)(struct i2c_client *, int kind, struct i2c_board_info *); + const struct i2c_client_address_data *address_data; + struct list_head clients; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) @@ -157,6 +181,7 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @irq: indicates the IRQ generated by this device (if any) * @list: list of active/busy clients (DEPRECATED) + * @detected: member of an i2c_driver.clients list * @released: used to synchronize client releases & detaches and references * * An i2c_client identifies a single device (i.e. chip) connected to an @@ -174,6 +199,7 @@ struct i2c_client { struct device dev; /* the device structure */ int irq; /* irq issued by device */ struct list_head list; /* DEPRECATED */ + struct list_head detected; struct completion released; }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3-70-g09d2 From 521e575b9a7324a0bca762622139f69582a042bf Mon Sep 17 00:00:00 2001 From: Ron Livne Date: Mon, 14 Jul 2008 23:48:48 -0700 Subject: IB/mlx4: Add support for blocking multicast loopback packets Add support for handling the IB_QP_CREATE_MULTICAST_BLOCK_LOOPBACK flag by using the per-multicast group loopback blocking feature of mlx4 hardware. 
Signed-off-by: Ron Livne Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 7 +++++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 ++- drivers/infiniband/hw/mlx4/qp.c | 21 ++++++++++++++++++--- drivers/net/mlx4/mcg.c | 17 +++++++++++++---- include/linux/mlx4/device.h | 3 ++- 5 files changed, 40 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4d61e32866c..bcf50648fa1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -90,7 +90,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_RC_RNR_NAK_GEN; + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -437,7 +438,9 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd) static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return mlx4_multicast_attach(to_mdev(ibqp->device)->dev, - &to_mqp(ibqp)->mqp, gid->raw); + &to_mqp(ibqp)->mqp, gid->raw, + !!(to_mqp(ibqp)->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 5cf994794d2..c4cf5b69eef 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -101,7 +101,8 @@ struct mlx4_ib_wq { }; enum mlx4_ib_qp_flags { - MLX4_IB_QP_LSO = 1 << 0 + MLX4_IB_QP_LSO = 1 << 0, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, }; struct mlx4_ib_qp { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 44bbd6c2e31..91590e7fba0 100644 --- 
a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -511,6 +511,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } else { qp->sq_no_prefetch = 0; + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; @@ -684,10 +687,15 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp; int err; - /* We only support LSO, and only for kernel UD QPs. */ - if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO) + /* + * We only support LSO and multicast loopback blocking, and + * only for kernel UD QPs. + */ + if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) return ERR_PTR(-EINVAL); - if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO && + + if (init_attr->create_flags && (pd->uobject || init_attr->qp_type != IB_QPT_UD)) return ERR_PTR(-EINVAL); @@ -1844,6 +1852,13 @@ done: qp_init_attr->cap = qp_attr->cap; + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 57f7f1f0d4e..b4b57870ddf 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -38,6 +38,9 @@ #include "mlx4.h" +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + struct mlx4_mgm { __be32 next_gid_index; __be32 members_count; @@ -153,7 +156,8 @@ static int find_mgm(struct mlx4_dev *dev, return err; } -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback) { struct mlx4_priv *priv 
= mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -202,13 +206,18 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) } for (i = 0; i < members_count; ++i) - if (mgm->qp[i] == cpu_to_be32(qp->qpn)) { + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn); err = 0; goto out; } - mgm->qp[members_count++] = cpu_to_be32(qp->qpn); + if (block_mcast_loopback) + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (1 << MGM_BLCK_LB_BIT)); + else + mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count); err = mlx4_WRITE_MCG(dev, index, mailbox); @@ -283,7 +292,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) members_count = be32_to_cpu(mgm->members_count); for (loc = -1, i = 0; i < members_count; ++i) - if (mgm->qp[i] == cpu_to_be32(qp->qpn)) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) loc = i; if (loc == -1) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a744383d16e..81b3dd5206e 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -398,7 +398,8 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, -- cgit v1.2.3-70-g09d2 From 124cafc5eb973e748c4ce3dc1caad29274e64613 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:44 +0200 Subject: ide: remove ide_init_drive_cmd ide_init_drive_cmd just 
calls blk_rq_init. This converts the users of ide_init_drive_cmd to use blk_rq_init directly and removes ide_init_drive_cmd. Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Cc: Jens Axboe Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 17 ----------------- drivers/scsi/ide-scsi.c | 4 ++-- include/linux/ide.h | 2 -- 5 files changed, 4 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 792a3cf73d6..7917cd57644 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -193,7 +193,7 @@ void ide_cd_init_rq(ide_drive_t *drive, struct request *rq) { struct cdrom_info *cd = drive->driver_data; - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_ATA_PC; rq->rq_disk = cd->disk; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index b10e9a813cd..9161cd92a84 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -286,7 +286,7 @@ static void idefloppy_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc, { struct ide_floppy_obj *floppy = drive->driver_data; - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->buffer = (char *) pc; rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 29f5cc863f6..d8b4d9f81ae 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1538,23 +1538,6 @@ irqreturn_t ide_intr (int irq, void *dev_id) return IRQ_HANDLED; } -/** - * ide_init_drive_cmd - initialize a drive command request - * @rq: request object - * - * Initialize a request before we fill it in and send it down to - * ide_do_drive_cmd. Commands must be set up by this function. Right - * now it doesn't do a lot, but if that changes abusers will have a - * nasty surprise. 
- */ - -void ide_init_drive_cmd (struct request *rq) -{ - blk_rq_init(NULL, rq); -} - -EXPORT_SYMBOL(ide_init_drive_cmd); - /** * ide_do_drive_cmd - issue IDE special command * @drive: device to issue command diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 89ecf013219..da261806d62 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -228,7 +228,7 @@ static int idescsi_check_condition(ide_drive_t *drive, kfree(pc); return -ENOMEM; } - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->special = (char *) pc; pc->rq = rq; pc->buf = buf; @@ -786,7 +786,7 @@ static int idescsi_queue (struct scsi_cmnd *cmd, } } - ide_init_drive_cmd (rq); + blk_rq_init(NULL, rq); rq->special = (char *) pc; rq->cmd_type = REQ_TYPE_SPECIAL; spin_unlock_irq(host->host_lock); diff --git a/include/linux/ide.h b/include/linux/ide.h index eddb6daadf4..3261c669175 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -857,8 +857,6 @@ int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); extern ide_startstop_t ide_do_reset (ide_drive_t *); -extern void ide_init_drive_cmd (struct request *rq); - /* * "action" parameter type for ide_do_drive_cmd() below. */ -- cgit v1.2.3-70-g09d2 From 681a561b7ec7fdcd8f35b68e44ac6d6c70aecc04 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:45 +0200 Subject: block: unexport blk_end_sync_rq All the users of blk_end_sync_rq has gone (they are converted to use blk_execute_rq). This unexports blk_end_sync_rq. 
Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Signed-off-by: Jens Axboe Signed-off-by: Bartlomiej Zolnierkiewicz --- block/blk-exec.c | 3 +-- include/linux/blkdev.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-exec.c b/block/blk-exec.c index 4f52f279205..9bceff7674f 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -18,7 +18,7 @@ * @rq: request to complete * @error: end io status of the request */ -void blk_end_sync_rq(struct request *rq, int error) +static void blk_end_sync_rq(struct request *rq, int error) { struct completion *waiting = rq->end_io_data; @@ -31,7 +31,6 @@ void blk_end_sync_rq(struct request *rq, int error) */ complete(waiting); } -EXPORT_SYMBOL(blk_end_sync_rq); /** * blk_execute_rq_nowait - insert a request into queue for execution diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d2a1b71e93c..1171abd7eb1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -623,7 +623,6 @@ extern void generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern void blk_end_sync_rq(struct request *rq, int error); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3-70-g09d2 From 30e5ee4d1a651a0c66e86c6612c003034bd20ba2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:46 +0200 Subject: ide: remove obsoleted "idebus=" kernel parameter * Remove obsoleted "idebus=" kernel parameter. * Remove no longer needed ide_system_bus_speed() and system_bus_clock() (together with idebus_parameter and system_bus_speed variables). 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide.c | 71 +------------------------------------------- drivers/ide/legacy/ali14xx.c | 2 +- drivers/ide/legacy/ht6560b.c | 2 +- drivers/ide/legacy/qd65xx.c | 4 +-- drivers/ide/pci/aec62xx.c | 2 +- drivers/ide/pci/alim15x3.c | 2 +- drivers/ide/pci/amd74xx.c | 2 +- drivers/ide/pci/cmd640.c | 8 ++--- drivers/ide/pci/cmd64x.c | 4 +-- drivers/ide/pci/cy82c693.c | 2 +- drivers/ide/pci/via82cxxx.c | 2 +- include/linux/ide.h | 2 -- 12 files changed, 15 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 8823df1b871..f65be738b16 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -86,9 +86,6 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; -static int idebus_parameter; /* holds the "idebus=" parameter */ -static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ - DEFINE_MUTEX(ide_cfg_mtx); __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); @@ -189,38 +186,6 @@ static void __init init_ide_data (void) } } -/** - * ide_system_bus_speed - guess bus speed - * - * ide_system_bus_speed() returns what we think is the system VESA/PCI - * bus speed (in MHz). This is used for calculating interface PIO timings. - * The default is 40 for known PCI systems, 50 otherwise. - * The "idebus=xx" parameter can be used to override this value. - * The actual value to be used is computed/displayed the first time - * through. Drivers should only use this as a last resort. - * - * Returns a guessed speed in MHz. 
- */ - -static int ide_system_bus_speed(void) -{ -#ifdef CONFIG_PCI - static struct pci_device_id pci_default[] = { - { PCI_DEVICE(PCI_ANY_ID, PCI_ANY_ID) }, - { } - }; -#else -#define pci_default 0 -#endif /* CONFIG_PCI */ - - /* user supplied value */ - if (idebus_parameter) - return idebus_parameter; - - /* safe default value for PCI or VESA and PCI*/ - return pci_dev_present(pci_default) ? 33 : 50; -} - void ide_remove_port_from_hwgroup(ide_hwif_t *hwif) { ide_hwgroup_t *hwgroup = hwif->hwgroup; @@ -540,20 +505,6 @@ static int set_unmaskirq(ide_drive_t *drive, int arg) return 0; } -/** - * system_bus_clock - clock guess - * - * External version of the bus clock guess used by very old IDE drivers - * for things like VLB timings. Should not be used. - */ - -int system_bus_clock (void) -{ - return system_bus_speed; -} - -EXPORT_SYMBOL(system_bus_clock); - static int generic_ide_suspend(struct device *dev, pm_message_t mesg) { ide_drive_t *drive = dev->driver_data; @@ -851,7 +802,7 @@ static int __init ide_setup(char *s) if (strncmp(s,"hd",2) == 0 && s[2] == '=') /* hd= is for hd.c */ return 0; /* driver and not us */ - if (strncmp(s,"ide",3) && strncmp(s,"idebus",6) && strncmp(s,"hd",2)) + if (strncmp(s, "ide", 3) && strncmp(s, "hd", 2)) return 0; printk(KERN_INFO "ide_setup: %s", s); @@ -951,21 +902,6 @@ static int __init ide_setup(char *s) } } - if (s[0] != 'i' || s[1] != 'd' || s[2] != 'e') - goto bad_option; - /* - * Look for bus speed option: "idebus=" - */ - if (s[3] == 'b' && s[4] == 'u' && s[5] == 's') { - if (match_parm(&s[6], NULL, vals, 1) != 1) - goto bad_option; - if (vals[0] >= 20 && vals[0] <= 66) { - idebus_parameter = vals[0]; - } else - printk(" -- BAD BUS SPEED! 
Expected value from 20 to 66"); - goto obsolete_option; - } - bad_option: printk(" -- BAD OPTION\n"); return 1; @@ -1287,11 +1223,6 @@ static int __init ide_init(void) int ret; printk(KERN_INFO "Uniform Multi-Platform E-IDE driver\n"); - system_bus_speed = ide_system_bus_speed(); - - printk(KERN_INFO "ide: Assuming %dMHz system bus speed " - "for PIO modes%s\n", system_bus_speed, - idebus_parameter ? "" : "; override with idebus=xx"); ret = bus_register(&ide_bus_type); if (ret < 0) { diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c index 90c65cf9744..052125fafcf 100644 --- a/drivers/ide/legacy/ali14xx.c +++ b/drivers/ide/legacy/ali14xx.c @@ -116,7 +116,7 @@ static void ali14xx_set_pio_mode(ide_drive_t *drive, const u8 pio) int time1, time2; u8 param1, param2, param3, param4; unsigned long flags; - int bus_speed = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; /* calculate timing, according to PIO mode */ time1 = ide_pio_cycle_time(drive, pio); diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c index 4fe516df9f7..dd6dfb32e85 100644 --- a/drivers/ide/legacy/ht6560b.c +++ b/drivers/ide/legacy/ht6560b.c @@ -212,7 +212,7 @@ static u8 ht_pio2timings(ide_drive_t *drive, const u8 pio) { int active_time, recovery_time; int active_cycles, recovery_cycles; - int bus_speed = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; if (pio) { unsigned int cycle_time; diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c index 6424af15432..51dba82f881 100644 --- a/drivers/ide/legacy/qd65xx.c +++ b/drivers/ide/legacy/qd65xx.c @@ -110,7 +110,7 @@ static void qd65xx_select(ide_drive_t *drive) static u8 qd6500_compute_timing (ide_hwif_t *hwif, int active_time, int recovery_time) { - int clk = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int clk = ide_vlb_clk ? 
ide_vlb_clk : 50; u8 act_cyc, rec_cyc; if (clk <= 33) { @@ -132,7 +132,7 @@ static u8 qd6500_compute_timing (ide_hwif_t *hwif, int active_time, int recovery static u8 qd6580_compute_timing (int active_time, int recovery_time) { - int clk = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int clk = ide_vlb_clk ? ide_vlb_clk : 50; u8 act_cyc, rec_cyc; act_cyc = 17 - IDE_IN(active_time * clk / 1000 + 1, 2, 17); diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c index 7f46c224b7c..ae7a4329a58 100644 --- a/drivers/ide/pci/aec62xx.c +++ b/drivers/ide/pci/aec62xx.c @@ -140,7 +140,7 @@ static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) { - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; if (bus_speed <= 33) pci_set_drvdata(dev, (void *) aec6xxx_33_base); diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c index f2129d5e07f..f2de00adf14 100644 --- a/drivers/ide/pci/alim15x3.c +++ b/drivers/ide/pci/alim15x3.c @@ -72,7 +72,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) int s_time, a_time, c_time; u8 s_clc, a_clc, r_clc; unsigned long flags; - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; int port = hwif->channel ? 0x5c : 0x58; int portFIFO = hwif->channel ? 0x55 : 0x54; u8 cd_dma_fifo = 0; diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c index a373101747b..ad222206a42 100644 --- a/drivers/ide/pci/amd74xx.c +++ b/drivers/ide/pci/amd74xx.c @@ -179,7 +179,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, * Determine the system bus clock. */ - amd_clock = (ide_pci_clk ? ide_pci_clk : system_bus_clock()) * 1000; + amd_clock = (ide_pci_clk ? 
ide_pci_clk : 33) * 1000; switch (amd_clock) { case 33000: amd_clock = 33333; break; diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c index b38a1980dcd..cd1ba14984a 100644 --- a/drivers/ide/pci/cmd640.c +++ b/drivers/ide/pci/cmd640.c @@ -525,12 +525,10 @@ static void cmd640_set_mode(ide_drive_t *drive, unsigned int index, u8 setup_count, active_count, recovery_count, recovery_count2, cycle_count; int bus_speed; - if (cmd640_vlb && ide_vlb_clk) - bus_speed = ide_vlb_clk; - else if (!cmd640_vlb && ide_pci_clk) - bus_speed = ide_pci_clk; + if (cmd640_vlb) + bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; else - bus_speed = system_bus_clock(); + bus_speed = ide_pci_clk ? ide_pci_clk : 33; if (pio_mode > 5) pio_mode = 5; diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c index 08674711d08..ca4774aa27e 100644 --- a/drivers/ide/pci/cmd64x.c +++ b/drivers/ide/pci/cmd64x.c @@ -69,7 +69,7 @@ static u8 quantize_timing(int timing, int quant) static void program_cycle_times (ide_drive_t *drive, int cycle_time, int active_time) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - int clock_time = 1000 / (ide_pci_clk ? ide_pci_clk : system_bus_clock()); + int clock_time = 1000 / (ide_pci_clk ? ide_pci_clk : 33); u8 cycle_count, active_count, recovery_count, drwtim; static const u8 recovery_values[] = {15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0}; @@ -128,7 +128,7 @@ static void cmd64x_tune_pio(ide_drive_t *drive, const u8 pio) ide_pio_timings[pio].active_time); setup_count = quantize_timing(ide_pio_timings[pio].setup_time, - 1000 / (ide_pci_clk ? ide_pci_clk : system_bus_clock())); + 1000 / (ide_pci_clk ? 
ide_pci_clk : 33)); /* * The primary channel has individual address setup timing registers diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c index 77cc22c2ad4..8c534afcb6c 100644 --- a/drivers/ide/pci/cy82c693.c +++ b/drivers/ide/pci/cy82c693.c @@ -134,7 +134,7 @@ static int calc_clk(int time, int bus_speed) static void compute_clocks(u8 pio, pio_clocks_t *p_pclk) { int clk1, clk2; - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; /* we don't check against CY82C693's min and max speed, * so you can play with the idebus=xx parameter diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c index e8c2570003f..3ed9728abd2 100644 --- a/drivers/ide/pci/via82cxxx.c +++ b/drivers/ide/pci/via82cxxx.c @@ -340,7 +340,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const * Determine system bus clock. */ - via_clock = (ide_pci_clk ? ide_pci_clk : system_bus_clock()) * 1000; + via_clock = (ide_pci_clk ? ide_pci_clk : 33) * 1000; switch (via_clock) { case 33000: via_clock = 33333; break; diff --git a/include/linux/ide.h b/include/linux/ide.h index 3261c669175..dad53565924 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -994,8 +994,6 @@ int ide_taskfile_ioctl(ide_drive_t *, unsigned int, unsigned long); int ide_cmd_ioctl(ide_drive_t *, unsigned int, unsigned long); int ide_task_ioctl(ide_drive_t *, unsigned int, unsigned long); -extern int system_bus_clock(void); - extern int ide_driveid_update(ide_drive_t *); extern int ide_config_drive_speed(ide_drive_t *, u8); extern u8 eighty_ninty_three (ide_drive_t *); -- cgit v1.2.3-70-g09d2 From 931ee0dc5c69e8113233d21942681ab8fecde7f9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:47 +0200 Subject: ide: remove obsoleted "ide=" kernel parameters * Remove obsoleted "ide=" kernel parameters. 
* Remove no longer needed: - ide_setup() - parse_options() - __setup("", ...) - module_param(options, ...) * Use module_{init,exit}() for MODULE=y case and remove MODULE ifdef. * Make ide_*acpi* and ide_doubler variables static. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 6 +-- drivers/ide/ide-dma.c | 2 +- drivers/ide/ide.c | 91 +++------------------------------------------- drivers/ide/legacy/gayle.c | 4 +- include/linux/ide.h | 4 -- 5 files changed, 10 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 9d3601fa568..6f704628c27 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -60,15 +60,15 @@ struct ide_acpi_hwif_link { #define DEBPRINT(fmt, args...) do {} while (0) #endif /* DEBUGGING */ -int ide_noacpi; +static int ide_noacpi; module_param_named(noacpi, ide_noacpi, bool, 0); MODULE_PARM_DESC(noacpi, "disable IDE ACPI support"); -int ide_acpigtf; +static int ide_acpigtf; module_param_named(acpigtf, ide_acpigtf, bool, 0); MODULE_PARM_DESC(acpigtf, "enable IDE ACPI _GTF support"); -int ide_acpionboot; +static int ide_acpionboot; module_param_named(acpionboot, ide_acpionboot, bool, 0); MODULE_PARM_DESC(acpionboot, "call IDE ACPI methods on boot"); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 653b1ade13d..174f4704614 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -692,7 +692,7 @@ static int ide_tune_dma(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 speed; - if (noautodma || drive->nodma || (drive->id->capability & 1) == 0) + if (drive->nodma || (drive->id->capability & 1) == 0) return 0; /* consult the list of known "bad" drives */ diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index e8c88ff2f6b..1defba3eefe 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -87,9 +87,9 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; DEFINE_MUTEX(ide_cfg_mtx); - 
__cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); -int noautodma = 0; +__cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); +EXPORT_SYMBOL(ide_lock); ide_hwif_t ide_hwifs[MAX_HWIFS]; /* master data repository */ @@ -698,59 +698,6 @@ set_val: EXPORT_SYMBOL(generic_ide_ioctl); -/* - * ide_setup() gets called VERY EARLY during initialization, - * to handle kernel "command line" strings beginning with "ide". - * - * Remember to update Documentation/ide/ide.txt if you change something here. - */ -static int __init ide_setup(char *s) -{ - printk(KERN_INFO "ide_setup: %s", s); - -#ifdef CONFIG_BLK_DEV_IDEDOUBLER - if (!strcmp(s, "ide=doubler")) { - extern int ide_doubler; - - printk(" : Enabled support for IDE doublers\n"); - ide_doubler = 1; - goto obsolete_option; - } -#endif /* CONFIG_BLK_DEV_IDEDOUBLER */ - - if (!strcmp(s, "ide=nodma")) { - printk(" : Prevented DMA\n"); - noautodma = 1; - goto obsolete_option; - } - -#ifdef CONFIG_BLK_DEV_IDEACPI - if (!strcmp(s, "ide=noacpi")) { - //printk(" : Disable IDE ACPI support.\n"); - ide_noacpi = 1; - goto obsolete_option; - } - if (!strcmp(s, "ide=acpigtf")) { - //printk(" : Enable IDE ACPI _GTF support.\n"); - ide_acpigtf = 1; - goto obsolete_option; - } - if (!strcmp(s, "ide=acpionboot")) { - //printk(" : Call IDE ACPI methods on boot.\n"); - ide_acpionboot = 1; - goto obsolete_option; - } -#endif /* CONFIG_BLK_DEV_IDEACPI */ - - printk(" -- BAD OPTION\n"); - return 1; -obsolete_option: - printk(" -- OBSOLETE OPTION, WILL BE REMOVED SOON!\n"); - return 1; -} - -EXPORT_SYMBOL(ide_lock); - static int ide_bus_match(struct device *dev, struct device_driver *drv) { return 1; @@ -1087,32 +1034,7 @@ out_port_class: return ret; } -#ifdef MODULE -static char *options = NULL; -module_param(options, charp, 0); -MODULE_LICENSE("GPL"); - -static void __init parse_options (char *line) -{ - char *next = line; - - if (line == NULL || !*line) - return; - while ((line = next) != NULL) { - if ((next = strchr(line,' ')) != NULL) - 
*next++ = 0; - if (!ide_setup(line)) - printk (KERN_INFO "Unknown option '%s'\n", line); - } -} - -int __init init_module (void) -{ - parse_options(options); - return ide_init(); -} - -void __exit cleanup_module (void) +static void __exit ide_exit(void) { proc_ide_destroy(); @@ -1121,10 +1043,7 @@ void __exit cleanup_module (void) bus_unregister(&ide_bus_type); } -#else /* !MODULE */ - -__setup("", ide_setup); - module_init(ide_init); +module_exit(ide_exit); -#endif /* MODULE */ +MODULE_LICENSE("GPL"); diff --git a/drivers/ide/legacy/gayle.c b/drivers/ide/legacy/gayle.c index fed7d812761..b78941680c3 100644 --- a/drivers/ide/legacy/gayle.c +++ b/drivers/ide/legacy/gayle.c @@ -64,9 +64,7 @@ #define GAYLE_HAS_CONTROL_REG (!ide_doubler) #define GAYLE_IDEREG_SIZE (ide_doubler ? 0x1000 : 0x2000) -int ide_doubler = 0; /* support IDE doublers? */ -EXPORT_SYMBOL_GPL(ide_doubler); - +static int ide_doubler; module_param_named(doubler, ide_doubler, bool, 0); MODULE_PARM_DESC(doubler, "enable support for IDE doublers"); #endif /* CONFIG_BLK_DEV_IDEDOUBLER */ diff --git a/include/linux/ide.h b/include/linux/ide.h index dad53565924..0fa1812d043 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -813,10 +813,6 @@ int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *, unsig #ifndef _IDE_C extern ide_hwif_t ide_hwifs[]; /* master data repository */ #endif -extern int ide_noacpi; -extern int ide_acpigtf; -extern int ide_acpionboot; -extern int noautodma; extern int ide_vlb_clk; extern int ide_pci_clk; -- cgit v1.2.3-70-g09d2 From 9a410e79b552bacb4481f85618aa7333b7776ed7 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:48 +0200 Subject: ide: remove IDE_TFLAG_NO_SELECT_MASK taskfile flag Always call SELECT_MASK(..., 0) in ide_tf_load() (needs to be done to match ide_set_irq(..., 1)) and then remove IDE_TFLAG_NO_SELECT_MASK taskfile flag. 
This change should only affect hpt366 and icside host drivers since ->maskproc(..., 0) for sgiioc4 is equivalent to ide_set_irq(..., 1). Cc: Sergei Shtylyov Cc: Russell King Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 4 ++-- drivers/ide/ide-disk.c | 3 +-- drivers/ide/ide-floppy.c | 3 +-- drivers/ide/ide-iops.c | 4 +--- drivers/ide/ide-tape.c | 3 +-- drivers/scsi/ide-scsi.c | 2 +- include/linux/ide.h | 1 - 7 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ac542ffffa4..0fbc2d8d0d5 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -530,8 +530,8 @@ static ide_startstop_t cdrom_start_packet_command(ide_drive_t *drive, info->dma = !hwif->dma_ops->dma_setup(drive); /* set up the controller registers */ - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL | - IDE_TFLAG_NO_SELECT_MASK, xferlen, info->dma); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL, + xferlen, info->dma); if (info->cd_flags & IDE_CD_FLAG_DRQ_INTERRUPT) { /* waiting for CDB interrupt, not DMA yet. */ diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index c5f22ef8ed2..5f49a4ae9dd 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -198,8 +198,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, } memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_NO_SELECT_MASK; /* FIXME? 
*/ - task.tf_flags |= (IDE_TFLAG_TF | IDE_TFLAG_DEVICE); + task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; if (drive->select.b.lba) { if (lba48) { diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 9161cd92a84..1852008d9ee 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -667,8 +667,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) dma = !hwif->dma_ops->dma_setup(drive); - ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK | - IDE_TFLAG_OUT_DEVICE, bcount, dma); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); if (dma) { /* Begin DMA, if necessary */ diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 2de4c8f581e..9f9916fe6c2 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -121,9 +121,7 @@ static void ide_tf_load(ide_drive_t *drive, ide_task_t *task) HIHI = 0xFF; ide_set_irq(drive, 1); - - if ((task->tf_flags & IDE_TFLAG_NO_SELECT_MASK) == 0) - SELECT_MASK(drive, 0); + SELECT_MASK(drive, 0); if (task->tf_flags & IDE_TFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index a5f0b774527..cc7991c7c25 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1046,8 +1046,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) dma_ok = !hwif->dma_ops->dma_setup(drive); - ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK | - IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); if (dma_ok) /* Will begin DMA later */ diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index da261806d62..3222aa589db 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -564,7 +564,7 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, hwif->sg_mapped = 0; } - 
ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK, bcount, dma); + ide_pktcmd_tf_load(drive, 0, bcount, dma); if (dma) pc->flags |= PC_FLAG_DMA_OK; diff --git a/include/linux/ide.h b/include/linux/ide.h index 0fa1812d043..d4a910cdb90 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -869,7 +869,6 @@ extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); enum { IDE_TFLAG_LBA48 = (1 << 0), - IDE_TFLAG_NO_SELECT_MASK = (1 << 1), IDE_TFLAG_FLAGGED = (1 << 2), IDE_TFLAG_OUT_DATA = (1 << 3), IDE_TFLAG_OUT_HOB_FEATURE = (1 << 4), -- cgit v1.2.3-70-g09d2 From ed4af48fd660176680da905817f6e40d51436e4c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:48 +0200 Subject: ide: move IRQ unmasking out from ->tf_load method Move IRQ unmasking out from ->tf_load method to its users. There should be no functional changes caused by this patch (SELECT_MASK() is NOP except for hpt366, icside and sgiioc4). Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/h8300/ide-h8300.c | 2 -- drivers/ide/ide-io.c | 2 ++ drivers/ide/ide-iops.c | 5 +---- drivers/ide/ide-taskfile.c | 2 ++ drivers/ide/pci/scc_pata.c | 2 -- include/linux/ide.h | 1 + 6 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/h8300/ide-h8300.c b/drivers/ide/h8300/ide-h8300.c index ecf53bb0d2a..d5afc28eaae 100644 --- a/drivers/ide/h8300/ide-h8300.c +++ b/drivers/ide/h8300/ide-h8300.c @@ -52,8 +52,6 @@ static void h8300_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c22a337ced4..2083cc08b2c 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1579,6 +1579,8 @@ void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount, u8 dma) task.tf.lbah = (bcount >> 8) & 0xff; 
ide_tf_dump(drive->name, &task.tf); + ide_set_irq(drive, 1); + SELECT_MASK(drive, 0); drive->hwif->tf_load(drive, &task); } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 9f9916fe6c2..491980aab86 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -95,7 +95,7 @@ void SELECT_DRIVE (ide_drive_t *drive) hwif->OUTB(drive->select.all, hwif->io_ports.device_addr); } -static void SELECT_MASK(ide_drive_t *drive, int mask) +void SELECT_MASK(ide_drive_t *drive, int mask) { const struct ide_port_ops *port_ops = drive->hwif->port_ops; @@ -120,9 +120,6 @@ static void ide_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - SELECT_MASK(drive, 0); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index b6a1c4b5112..6a17ab54f80 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -109,6 +109,8 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) if ((task->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); + ide_set_irq(drive, 1); + SELECT_MASK(drive, 0); hwif->tf_load(drive, task); } diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 910fb00deb7..37e8cfcabb4 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -662,8 +662,6 @@ static void scc_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) out_be32((void *)io_ports->data_addr, (tf->hob_data << 8) | tf->data); diff --git a/include/linux/ide.h b/include/linux/ide.h index d4a910cdb90..56d0bc2dffe 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -973,6 +973,7 @@ typedef struct ide_task_s { void ide_tf_dump(const char *, struct ide_taskfile *); extern void 
SELECT_DRIVE(ide_drive_t *); +void SELECT_MASK(ide_drive_t *, int); extern int drive_is_ready(ide_drive_t *); -- cgit v1.2.3-70-g09d2 From 135721446144af005109c25eeacca4fdddcd9a66 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:49 +0200 Subject: ide: remove ->mmio flag from ide_hwif_t Since scc_pata host driver no longer uses IDE PCI layer / ide_dma_setup() and all other ->mmio users set also IDE_HFLAG_MMIO host flag we can safely remove ->mmio flag. There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/arm/palm_bk3710.c | 1 - drivers/ide/ide-dma.c | 2 +- drivers/ide/pci/scc_pata.c | 1 - drivers/ide/pci/siimage.c | 25 +++++++++++++------------ drivers/ide/setup-pci.c | 4 ++-- include/linux/ide.h | 1 - 6 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/arm/palm_bk3710.c b/drivers/ide/arm/palm_bk3710.c index 74a05dc6d1e..3839f572298 100644 --- a/drivers/ide/arm/palm_bk3710.c +++ b/drivers/ide/arm/palm_bk3710.c @@ -405,7 +405,6 @@ static int __devinit palm_bk3710_probe(struct platform_device *pdev) ide_init_port_data(hwif, i); ide_init_port_hw(hwif, &hw); - hwif->mmio = 1; default_hwif_mmiops(hwif); idx[0] = i; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 174f4704614..7ee44f86bc5 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -463,7 +463,7 @@ int ide_dma_setup(ide_drive_t *drive) } /* PRD table */ - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) writel(hwif->dmatable_dma, (void __iomem *)(hwif->dma_base + ATA_DMA_TABLE_OFS)); else diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 37e8cfcabb4..133053c7a48 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -793,7 +793,6 @@ static void __devinit init_mmio_iops_scc(ide_hwif_t *hwif) hwif->dma_base = dma_base; hwif->config_data = ports->ctl; - hwif->mmio = 1; } /** diff 
--git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c index 0006b9e5856..b75e9bb390a 100644 --- a/drivers/ide/pci/siimage.c +++ b/drivers/ide/pci/siimage.c @@ -94,7 +94,7 @@ static unsigned long siimage_selreg(ide_hwif_t *hwif, int r) unsigned long base = (unsigned long)hwif->hwif_data; base += 0xA0 + r; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) base += hwif->channel << 6; else base += hwif->channel << 4; @@ -117,7 +117,7 @@ static inline unsigned long siimage_seldev(ide_drive_t *drive, int r) unsigned long base = (unsigned long)hwif->hwif_data; base += 0xA0 + r; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) base += hwif->channel << 6; else base += hwif->channel << 4; @@ -190,7 +190,9 @@ static u8 sil_pata_udma_filter(ide_drive_t *drive) unsigned long base = (unsigned long)hwif->hwif_data; u8 scsc, mask = 0; - scsc = sil_ioread8(dev, base + (hwif->mmio ? 0x4A : 0x8A)); + base += (hwif->host_flags & IDE_HFLAG_MMIO) ? 0x4A : 0x8A; + + scsc = sil_ioread8(dev, base); switch (scsc & 0x30) { case 0x10: /* 133 */ @@ -238,8 +240,9 @@ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) unsigned long tfaddr = siimage_selreg(hwif, 0x02); unsigned long base = (unsigned long)hwif->hwif_data; u8 tf_pio = pio; - u8 addr_mask = hwif->channel ? (hwif->mmio ? 0xF4 : 0x84) - : (hwif->mmio ? 0xB4 : 0x80); + u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + u8 addr_mask = hwif->channel ? (mmio ? 0xF4 : 0x84) + : (mmio ? 0xB4 : 0x80); u8 mode = 0; u8 unit = drive->select.b.unit; @@ -290,13 +293,13 @@ static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) u16 ultra = 0, multi = 0; u8 mode = 0, unit = drive->select.b.unit; unsigned long base = (unsigned long)hwif->hwif_data; - u8 scsc = 0, addr_mask = hwif->channel ? - (hwif->mmio ? 0xF4 : 0x84) : - (hwif->mmio ? 0xB4 : 0x80); + u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + u8 scsc = 0, addr_mask = hwif->channel ? (mmio ? 0xF4 : 0x84) + : (mmio ? 
0xB4 : 0x80); unsigned long ma = siimage_seldev(drive, 0x08); unsigned long ua = siimage_seldev(drive, 0x0C); - scsc = sil_ioread8 (dev, base + (hwif->mmio ? 0x4A : 0x8A)); + scsc = sil_ioread8 (dev, base + (mmio ? 0x4A : 0x8A)); mode = sil_ioread8 (dev, base + addr_mask); multi = sil_ioread16(dev, ma); ultra = sil_ioread16(dev, ua); @@ -391,7 +394,7 @@ static int siimage_mmio_dma_test_irq(ide_drive_t *drive) static int siimage_dma_test_irq(ide_drive_t *drive) { - if (drive->hwif->mmio) + if (drive->hwif->host_flags & IDE_HFLAG_MMIO) return siimage_mmio_dma_test_irq(drive); else return siimage_io_dma_test_irq(drive); @@ -640,8 +643,6 @@ static void __devinit init_mmio_iops_siimage(ide_hwif_t *hwif) hwif->irq = dev->irq; hwif->dma_base = (unsigned long)addr + (ch ? 0x08 : 0x00); - - hwif->mmio = 1; } static int is_dev_seagate_sata(ide_drive_t *drive) diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 5171601fb25..abcfb1739d4 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -87,7 +87,7 @@ unsigned long ide_pci_dma_base(ide_hwif_t *hwif, const struct ide_port_info *d) unsigned long dma_base = 0; u8 dma_stat = 0; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) return hwif->dma_base; if (hwif->mate && hwif->mate->dma_base) { @@ -374,7 +374,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) if (base == 0 || ide_pci_set_master(dev, d->name) < 0) return -1; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) printk(KERN_INFO " %s: MMIO-DMA\n", hwif->name); else printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", diff --git a/include/linux/ide.h b/include/linux/ide.h index 56d0bc2dffe..b01b102be4d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -532,7 +532,6 @@ typedef struct hwif_s { unsigned serialized : 1; /* serialized all channel operation */ unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ - 
unsigned mmio : 1; /* host uses MMIO */ struct device gendev; struct device *portdev; -- cgit v1.2.3-70-g09d2 From f8c4bd0ab2b8783c0f080957781e9f70bee48eaa Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:49 +0200 Subject: ide: pass 'hwif *' instead of 'drive *' to ->OUTBSYNC method There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 18 +++++++++--------- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/pci/scc_pata.c | 5 +---- drivers/ide/ppc/pmac.c | 6 +++--- drivers/scsi/ide-scsi.c | 2 +- include/linux/ide.h | 2 +- 8 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 2083cc08b2c..c28fcdf0ee9 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -437,7 +437,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, u if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT)) /* force an abort */ - hwif->OUTBSYNC(drive, WIN_IDLEIMMEDIATE, + hwif->OUTBSYNC(hwif, WIN_IDLEIMMEDIATE, hwif->io_ports.command_addr); if (rq->errors >= ERROR_MAX) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 491980aab86..4c32cf0b623 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -42,7 +42,7 @@ static void ide_outb (u8 val, unsigned long port) outb(val, port); } -static void ide_outbsync (ide_drive_t *drive, u8 addr, unsigned long port) +static void ide_outbsync(ide_hwif_t *hwif, u8 addr, unsigned long port) { outb(addr, port); } @@ -68,7 +68,7 @@ static void ide_mm_outb (u8 value, unsigned long port) writeb(value, (void __iomem *) port); } -static void ide_mm_outbsync (ide_drive_t *drive, u8 value, unsigned long port) +static void ide_mm_outbsync(ide_hwif_t *hwif, u8 value, unsigned long port) { writeb(value, (void __iomem *) port); } @@ -686,7 +686,7 @@ int 
ide_driveid_update(ide_drive_t *drive) SELECT_MASK(drive, 1); ide_set_irq(drive, 0); msleep(50); - hwif->OUTBSYNC(drive, WIN_IDENTIFY, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, WIN_IDENTIFY, hwif->io_ports.command_addr); timeout = jiffies + WAIT_WORSTCASE; do { if (time_after(jiffies, timeout)) { @@ -773,7 +773,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) ide_set_irq(drive, 0); hwif->OUTB(speed, io_ports->nsect_addr); hwif->OUTB(SETFEATURES_XFER, io_ports->feature_addr); - hwif->OUTBSYNC(drive, WIN_SETFEATURES, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SETFEATURES, io_ports->command_addr); if (drive->quirk_list == 2) ide_set_irq(drive, 1); @@ -881,7 +881,7 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, spin_lock_irqsave(&ide_lock, flags); __ide_set_handler(drive, handler, timeout, expiry); - hwif->OUTBSYNC(drive, cmd, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, cmd, hwif->io_ports.command_addr); /* * Drive takes 400nS to respond, we must avoid the IRQ being * serviced before that. @@ -899,7 +899,7 @@ void ide_execute_pkt_cmd(ide_drive_t *drive) unsigned long flags; spin_lock_irqsave(&ide_lock, flags); - hwif->OUTBSYNC(drive, WIN_PACKETCMD, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, WIN_PACKETCMD, hwif->io_ports.command_addr); ndelay(400); spin_unlock_irqrestore(&ide_lock, flags); } @@ -1094,7 +1094,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) pre_reset(drive); SELECT_DRIVE(drive); udelay (20); - hwif->OUTBSYNC(drive, WIN_SRST, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SRST, io_ports->command_addr); ndelay(400); hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE; hwgroup->polling = 1; @@ -1125,14 +1125,14 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) * recover from reset very quickly, saving us the first 50ms wait time. 
*/ /* set SRST and nIEN */ - hwif->OUTBSYNC(drive, drive->ctl|6, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, drive->ctl | 6, io_ports->ctl_addr); /* more than enough time */ udelay(10); if (drive->quirk_list == 2) ctl = drive->ctl; /* clear SRST and nIEN */ else ctl = drive->ctl | 2; /* clear SRST, leave nIEN */ - hwif->OUTBSYNC(drive, ctl, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ctl, io_ports->ctl_addr); /* more than enough time */ udelay(10); hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 12513c45d70..b010633eb5b 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -293,7 +293,7 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd) hwif->OUTB(0, io_ports->feature_addr); /* ask drive for ID */ - hwif->OUTBSYNC(drive, cmd, io_ports->command_addr); + hwif->OUTBSYNC(hwif, cmd, hwif->io_ports.command_addr); timeout = ((cmd == WIN_IDENTIFY) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; timeout += jiffies; @@ -480,7 +480,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) msleep(50); SELECT_DRIVE(drive); msleep(50); - hwif->OUTBSYNC(drive, WIN_SRST, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SRST, io_ports->command_addr); (void)ide_busy_sleep(hwif); rc = try_to_identify(drive, cmd); } @@ -516,7 +516,7 @@ static void enable_nest (ide_drive_t *drive) printk("%s: enabling %s -- ", hwif->name, drive->id->model); SELECT_DRIVE(drive); msleep(50); - hwif->OUTBSYNC(drive, EXABYTE_ENABLE_NEST, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, EXABYTE_ENABLE_NEST, hwif->io_ports.command_addr); if (ide_busy_sleep(hwif)) { printk(KERN_CONT "failed (timeout)\n"); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 6a17ab54f80..cf55a48a7dd 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -117,7 +117,7 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) switch (task->data_phase) { case 
TASKFILE_MULTI_OUT: case TASKFILE_OUT: - hwif->OUTBSYNC(drive, tf->command, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, tf->command, hwif->io_ports.command_addr); ndelay(400); /* FIXME */ return pre_task_out_intr(drive, task->rq); case TASKFILE_MULTI_IN: diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 133053c7a48..32eb0877fce 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -148,11 +148,8 @@ static void scc_ide_outb(u8 addr, unsigned long port) out_be32((void*)port, addr); } -static void -scc_ide_outbsync(ide_drive_t * drive, u8 addr, unsigned long port) +static void scc_ide_outbsync(ide_hwif_t *hwif, u8 addr, unsigned long port) { - ide_hwif_t *hwif = HWIF(drive); - out_be32((void*)port, addr); eieio(); in_be32((void*)(hwif->dma_base + 0x01c)); diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c index ba2d5872796..dcb2c466bb9 100644 --- a/drivers/ide/ppc/pmac.c +++ b/drivers/ide/ppc/pmac.c @@ -480,13 +480,13 @@ pmac_ide_do_update_timings(ide_drive_t *drive) pmac_ide_selectproc(drive); } -static void -pmac_outbsync(ide_drive_t *drive, u8 value, unsigned long port) +static void pmac_outbsync(ide_hwif_t *hwif, u8 value, unsigned long port) { u32 tmp; writeb(value, (void __iomem *) port); - tmp = readl(PMAC_IDE_REG(IDE_TIMING_CONFIG)); + tmp = readl((void __iomem *)(hwif->io_ports.data_addr + + IDE_TIMING_CONFIG)); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 3222aa589db..d7fd5e550a2 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -257,7 +257,7 @@ idescsi_atapi_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err) if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT)) /* force an abort */ - hwif->OUTBSYNC(drive, WIN_IDLEIMMEDIATE, + hwif->OUTBSYNC(hwif, WIN_IDLEIMMEDIATE, hwif->io_ports.command_addr); rq->errors++; diff --git a/include/linux/ide.h b/include/linux/ide.h index b01b102be4d..1c343146964 100644 --- a/include/linux/ide.h +++ 
b/include/linux/ide.h @@ -493,7 +493,7 @@ typedef struct hwif_s { void (*ide_dma_clear_irq)(ide_drive_t *drive); void (*OUTB)(u8 addr, unsigned long port); - void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); + void (*OUTBSYNC)(struct hwif_s *hwif, u8 addr, unsigned long port); u8 (*INB)(unsigned long port); -- cgit v1.2.3-70-g09d2 From 0fd04dcc2ebb6ec9088c24b368b0ce1f42a98ef5 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:50 +0200 Subject: ide: use ->OUTBSYNC in ide_set_irq() Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 1c343146964..4d1c9714f1d 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1340,7 +1340,8 @@ static inline void ide_set_irq(ide_drive_t *drive, int on) { ide_hwif_t *hwif = drive->hwif; - hwif->OUTB(drive->ctl | (on ? 0 : 2), hwif->io_ports.ctl_addr); + hwif->OUTBSYNC(hwif, drive->ctl | (on ? 0 : 2), + hwif->io_ports.ctl_addr); } static inline u8 ide_read_status(ide_drive_t *drive) -- cgit v1.2.3-70-g09d2 From ff07488346702f554aaeb6aae982540aa0302373 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:50 +0200 Subject: ide: remove drive->ctl Remove drive->ctl (it is always equal to 0x08 after init time). While at it: * Use ATA_DEVCTL_OBS define. There should be no functional changes caused by this patch. 
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/h8300/ide-h8300.c | 4 ++-- drivers/ide/ide-iops.c | 10 +++++----- drivers/ide/ide-probe.c | 2 +- drivers/ide/ide.c | 1 - drivers/ide/pci/hpt366.c | 3 +-- drivers/ide/pci/ns87415.c | 4 ++-- drivers/ide/pci/scc_pata.c | 4 ++-- drivers/ide/pci/sgiioc4.c | 2 +- include/linux/ide.h | 3 +-- 9 files changed, 15 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/h8300/ide-h8300.c b/drivers/ide/h8300/ide-h8300.c index d5afc28eaae..ae37ee58bae 100644 --- a/drivers/ide/h8300/ide-h8300.c +++ b/drivers/ide/h8300/ide-h8300.c @@ -96,7 +96,7 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - outb(drive->ctl & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); @@ -110,7 +110,7 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - outb(drive->ctl | 0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 4c32cf0b623..80ad4f234f3 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -186,7 +186,7 @@ static void ide_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - tf_outb(drive->ctl & ~0x80, io_ports->ctl_addr); + tf_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); @@ -200,7 +200,7 @@ static void ide_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = tf_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - tf_outb(drive->ctl | 0x80, io_ports->ctl_addr); + tf_outb(ATA_DEVCTL_OBS 
| 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tf_inb(io_ports->feature_addr); @@ -1125,13 +1125,13 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) * recover from reset very quickly, saving us the first 50ms wait time. */ /* set SRST and nIEN */ - hwif->OUTBSYNC(hwif, drive->ctl | 6, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS | 6, io_ports->ctl_addr); /* more than enough time */ udelay(10); if (drive->quirk_list == 2) - ctl = drive->ctl; /* clear SRST and nIEN */ + ctl = ATA_DEVCTL_OBS; /* clear SRST and nIEN */ else - ctl = drive->ctl | 2; /* clear SRST, leave nIEN */ + ctl = ATA_DEVCTL_OBS | 2; /* clear SRST, leave nIEN */ hwif->OUTBSYNC(hwif, ctl, io_ports->ctl_addr); /* more than enough time */ udelay(10); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 809362b13c9..d21e51a02c3 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1065,7 +1065,7 @@ static int init_irq (ide_hwif_t *hwif) if (io_ports->ctl_addr) /* clear nIEN */ - hwif->OUTBSYNC(hwif, 0x08, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS, io_ports->ctl_addr); if (request_irq(hwif->irq,&ide_intr,sa,hwif->name,hwgroup)) goto out_unlink; diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 1defba3eefe..2b8453510e0 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -136,7 +136,6 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) drive->media = ide_disk; drive->select.all = (unit<<4)|0xa0; drive->hwif = hwif; - drive->ctl = 0x08; drive->ready_stat = READY_STAT; drive->bad_wstat = BAD_W_STAT; drive->special.b.recalibrate = 1; diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c index c929dadaaaf..397c6cbe953 100644 --- a/drivers/ide/pci/hpt366.c +++ b/drivers/ide/pci/hpt366.c @@ -759,8 +759,7 @@ static void hpt3xx_maskproc(ide_drive_t *drive, int mask) enable_irq (hwif->irq); } } else - outb(mask ? 
(drive->ctl | 2) : (drive->ctl & ~2), - hwif->io_ports.ctl_addr); + outb(ATA_DEVCTL_OBS | (mask ? 2 : 0), hwif->io_ports.ctl_addr); } /* diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c index a7a41bb8277..45ba71a7182 100644 --- a/drivers/ide/pci/ns87415.c +++ b/drivers/ide/pci/ns87415.c @@ -76,7 +76,7 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - outb(drive->ctl & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); @@ -90,7 +90,7 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = superio_ide_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - outb(drive->ctl | 0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 32eb0877fce..1584ebb6a18 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -703,7 +703,7 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - scc_ide_outb(drive->ctl & ~0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); @@ -717,7 +717,7 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = scc_ide_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - scc_ide_outb(drive->ctl | 0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = scc_ide_inb(io_ports->feature_addr); diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c index c1b667c53f2..24513e3dcd6 100644 --- 
a/drivers/ide/pci/sgiioc4.c +++ b/drivers/ide/pci/sgiioc4.c @@ -111,7 +111,7 @@ sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port, static void sgiioc4_maskproc(ide_drive_t * drive, int mask) { - writeb(mask ? (drive->ctl | 2) : (drive->ctl & ~2), + writeb(ATA_DEVCTL_OBS | (mask ? 2 : 0), (void __iomem *)drive->hwif->io_ports.ctl_addr); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 4d1c9714f1d..d8c86f0362c 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -364,7 +364,6 @@ typedef struct ide_drive_s { u8 wcache; /* status of write cache */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ - u8 ctl; /* "normal" value for Control register */ u8 ready_stat; /* min status value for drive ready */ u8 mult_count; /* current multiple sector setting */ u8 mult_req; /* requested multiple sector setting */ @@ -1340,7 +1339,7 @@ static inline void ide_set_irq(ide_drive_t *drive, int on) { ide_hwif_t *hwif = drive->hwif; - hwif->OUTBSYNC(hwif, drive->ctl | (on ? 0 : 2), + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS | (on ? 0 : 2), hwif->io_ports.ctl_addr); } -- cgit v1.2.3-70-g09d2 From 63f5abb0959337db0d5bece9cefba03cdcadec51 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:51 +0200 Subject: ide: remove action argument in ide_do_drive_cmd ide_do_drive_cmd is called only with ide_preempt action argument. So we can remove the action argument in ide_do_drive_cmd and ide_action_t typedef. This patch also includes two minor cleanups: 1) ide_do_drive_cmd always succeeds so we don't need the return value; 2) the callers use blk_rq_init before ide_do_drive_cmd so there is no need to initialize rq->errors. 
Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 40 +++++++++------------------------------- drivers/ide/ide-tape.c | 2 +- drivers/scsi/ide-scsi.c | 3 ++- include/linux/ide.h | 12 +----------- 6 files changed, 15 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 0fbc2d8d0d5..043129c422f 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -213,7 +213,7 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, /* NOTE! Save the failed command in "rq->buffer" */ rq->buffer = (void *) failed_command; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } static void cdrom_end_request(ide_drive_t *drive, int uptodate) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 1852008d9ee..53209a47393 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -291,7 +291,7 @@ static void idefloppy_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc, rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; rq->rq_disk = floppy->disk; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } static struct ide_atapi_pc *idefloppy_next_pc_storage(ide_drive_t *drive) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c28fcdf0ee9..28057747c1f 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1520,49 +1520,27 @@ irqreturn_t ide_intr (int irq, void *dev_id) * ide_do_drive_cmd - issue IDE special command * @drive: device to issue command * @rq: request to issue - * @action: action for processing * * This function issues a special IDE device request * onto the request queue. * - * If action is ide_wait, then the rq is queued at the end of the - * request queue, and the function sleeps until it has been processed. 
- * This is for use when invoked from an ioctl handler. - * - * If action is ide_preempt, then the rq is queued at the head of - * the request queue, displacing the currently-being-processed - * request and this function returns immediately without waiting - * for the new rq to be completed. This is VERY DANGEROUS, and is - * intended for careful use by the ATAPI tape/cdrom driver code. - * - * If action is ide_end, then the rq is queued at the end of the - * request queue, and the function returns immediately without waiting - * for the new rq to be completed. This is again intended for careful - * use by the ATAPI tape/cdrom driver code. + * the rq is queued at the head of the request queue, displacing + * the currently-being-processed request and this function + * returns immediately without waiting for the new rq to be + * completed. This is VERY DANGEROUS, and is intended for + * careful use by the ATAPI tape/cdrom driver code. */ - -int ide_do_drive_cmd (ide_drive_t *drive, struct request *rq, ide_action_t action) + +void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq) { unsigned long flags; ide_hwgroup_t *hwgroup = HWGROUP(drive); - int where = ELEVATOR_INSERT_BACK; - - rq->errors = 0; - - if (action == ide_preempt) - where = ELEVATOR_INSERT_FRONT; spin_lock_irqsave(&ide_lock, flags); - if (action == ide_preempt) - hwgroup->rq = NULL; - __elv_add_request(drive->queue, rq, where, 1); + hwgroup->rq = NULL; + __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 1); __generic_unplug_device(drive->queue); - /* the queue is stopped so it won't be plugged+unplugged */ - if (blk_pm_resume_request(rq)) - do_ide_request(drive->queue); spin_unlock_irqrestore(&ide_lock, flags); - - return 0; } EXPORT_SYMBOL(ide_do_drive_cmd); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cc7991c7c25..a562df82077 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -691,7 +691,7 @@ static void idetape_queue_pc_head(ide_drive_t *drive, 
struct ide_atapi_pc *pc, rq->cmd_flags |= REQ_PREEMPT; rq->buffer = (char *) pc; rq->rq_disk = tape->disk; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 58e30efe7a7..569ffde6d04 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -245,7 +245,8 @@ static int idescsi_check_condition(ide_drive_t *drive, ide_scsi_hex_dump(pc->c, 6); } rq->rq_disk = scsi->disk; - return ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); + return 0; } static int idescsi_end_request(ide_drive_t *, int, int); diff --git a/include/linux/ide.h b/include/linux/ide.h index d8c86f0362c..04267dc1edf 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -851,17 +851,7 @@ int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); extern ide_startstop_t ide_do_reset (ide_drive_t *); -/* - * "action" parameter type for ide_do_drive_cmd() below. - */ -typedef enum { - ide_wait, /* insert rq at end of list, and wait for it */ - ide_preempt, /* insert rq in front of current request */ - ide_head_wait, /* insert rq in front of current request and wait for it */ - ide_end /* insert rq at end of list, but don't wait for it */ -} ide_action_t; - -extern int ide_do_drive_cmd(ide_drive_t *, struct request *, ide_action_t); +extern void ide_do_drive_cmd(ide_drive_t *, struct request *); extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); -- cgit v1.2.3-70-g09d2 From 92f5daff2b8439fa4c57c57f47823ffc459c3bd9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:55 +0200 Subject: ide-tape: make pc->idetape_callback void There should be no functional changes caused by this patch. 
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-tape.c | 13 +++++++------ include/linux/ide.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d387aaf0eb3..88d26efdf84 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -619,7 +619,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) return 0; } -static ide_startstop_t ide_tape_callback(ide_drive_t *drive) +static void ide_tape_callback(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc *pc = tape->pc; @@ -675,8 +675,6 @@ static ide_startstop_t ide_tape_callback(ide_drive_t *drive) } idetape_end_request(drive, uptodate, 0); - - return ide_stopped; } static void idetape_init_pc(struct ide_atapi_pc *pc) @@ -843,7 +841,8 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) if (tape->failed_pc == pc) tape->failed_pc = NULL; /* Command finished - Call the callback function */ - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { @@ -1035,7 +1034,8 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->error = IDETAPE_ERROR_GENERAL; } tape->failed_pc = NULL; - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); @@ -1120,7 +1120,8 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) pc->error = IDETAPE_ERROR_GENERAL; tape->failed_pc = NULL; } - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } static void idetape_create_read_cmd(idetape_tape_t *tape, diff --git a/include/linux/ide.h b/include/linux/ide.h index 04267dc1edf..8936b21a703 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -641,7 +641,7 @@ struct ide_atapi_pc { */ 
u8 pc_buf[256]; void (*idefloppy_callback) (ide_drive_t *); - ide_startstop_t (*idetape_callback) (ide_drive_t *); + void (*idetape_callback) (ide_drive_t *); /* idetape only */ struct idetape_bh *bh; -- cgit v1.2.3-70-g09d2 From 1b06e92aa03018e4b3ba281e03a7711d9b71a998 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:56 +0200 Subject: ide-{floppy,tape}: merge pc->idefloppy_callback and pc->idetape_callback Merge pc->idefloppy_callback and pc->idetape_callback into pc->callback. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-tape.c | 8 ++++---- include/linux/ide.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 2058a6f3f33..a9f3127a74e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -354,7 +354,7 @@ static void idefloppy_init_pc(struct ide_atapi_pc *pc) pc->req_xfer = 0; pc->buf = pc->pc_buf; pc->buf_size = IDEFLOPPY_PC_BUFFER_SIZE; - pc->idefloppy_callback = &ide_floppy_callback; + pc->callback = ide_floppy_callback; } static void idefloppy_create_request_sense_cmd(struct ide_atapi_pc *pc) @@ -438,7 +438,7 @@ static ide_startstop_t idefloppy_pc_intr(ide_drive_t *drive) if (floppy->failed_pc == pc) floppy->failed_pc = NULL; /* Command finished - Call the callback function */ - pc->idefloppy_callback(drive); + pc->callback(drive); return ide_stopped; } @@ -612,7 +612,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, pc->error = IDEFLOPPY_ERROR_GENERAL; floppy->failed_pc = NULL; - pc->idefloppy_callback(drive); + pc->callback(drive); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 88d26efdf84..ce9b6d32752 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -687,7 +687,7 @@ static void 
idetape_init_pc(struct ide_atapi_pc *pc) pc->buf_size = IDETAPE_PC_BUFFER_SIZE; pc->bh = NULL; pc->b_data = NULL; - pc->idetape_callback = ide_tape_callback; + pc->callback = ide_tape_callback; } static void idetape_create_request_sense_cmd(struct ide_atapi_pc *pc) @@ -841,7 +841,7 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) if (tape->failed_pc == pc) tape->failed_pc = NULL; /* Command finished - Call the callback function */ - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } @@ -1034,7 +1034,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->error = IDETAPE_ERROR_GENERAL; } tape->failed_pc = NULL; - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); @@ -1120,7 +1120,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) pc->error = IDETAPE_ERROR_GENERAL; tape->failed_pc = NULL; } - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 8936b21a703..f079456adfd 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -640,8 +640,8 @@ struct ide_atapi_pc { * to change/removal later. */ u8 pc_buf[256]; - void (*idefloppy_callback) (ide_drive_t *); - void (*idetape_callback) (ide_drive_t *); + + void (*callback)(ide_drive_t *); /* idetape only */ struct idetape_bh *bh; -- cgit v1.2.3-70-g09d2 From 5e3310958204912f3f00be2592c945fbc37db6ae Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:56 +0200 Subject: ide-{floppy,tape}: PC_FLAG_DMA_RECOMMENDED -> PC_FLAG_DMA_OK * Use PC_FLAG_DMA_OK flag instead of PC_FLAG_DMA_RECOMMENDED one. * Remove no longer used PC_FLAG_DMA_RECOMMENDED flag. There should be no functional changes caused by this patch. 
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-tape.c | 6 +++--- include/linux/ide.h | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a9f3127a74e..dbefe35c139 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -630,7 +630,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, } dma = 0; - if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) dma = !hwif->dma_ops->dma_setup(drive); ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); @@ -755,7 +755,7 @@ static void idefloppy_create_rw_cmd(idefloppy_floppy_t *floppy, pc->flags |= PC_FLAG_WRITING; pc->buf = NULL; pc->req_xfer = pc->buf_size = blocks * floppy->block_size; - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, @@ -769,7 +769,7 @@ static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, pc->flags |= PC_FLAG_WRITING; pc->buf = rq->data; if (rq->bio) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; /* * possibly problematic, doesn't look like ide-floppy correctly * handled scattered requests if dma fails... 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index ce9b6d32752..e8a5852fa2d 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1050,7 +1050,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->flags &= ~PC_FLAG_DMA_ERROR; ide_dma_off(drive); } - if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) dma_ok = !hwif->dma_ops->dma_setup(drive); ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); @@ -1138,7 +1138,7 @@ static void idetape_create_read_cmd(idetape_tape_t *tape, pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; if (pc->req_xfer == tape->buffer_size) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static void idetape_create_write_cmd(idetape_tape_t *tape, @@ -1157,7 +1157,7 @@ static void idetape_create_write_cmd(idetape_tape_t *tape, pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; if (pc->req_xfer == tape->buffer_size) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static ide_startstop_t idetape_do_request(ide_drive_t *drive, diff --git a/include/linux/ide.h b/include/linux/ide.h index f079456adfd..63cee2947f6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -602,12 +602,11 @@ enum { PC_FLAG_SUPPRESS_ERROR = (1 << 1), PC_FLAG_WAIT_FOR_DSC = (1 << 2), PC_FLAG_DMA_OK = (1 << 3), - PC_FLAG_DMA_RECOMMENDED = (1 << 4), - PC_FLAG_DMA_IN_PROGRESS = (1 << 5), - PC_FLAG_DMA_ERROR = (1 << 6), - PC_FLAG_WRITING = (1 << 7), + PC_FLAG_DMA_IN_PROGRESS = (1 << 4), + PC_FLAG_DMA_ERROR = (1 << 5), + PC_FLAG_WRITING = (1 << 6), /* command timed out */ - PC_FLAG_TIMEDOUT = (1 << 8), + PC_FLAG_TIMEDOUT = (1 << 7), }; struct ide_atapi_pc { -- cgit v1.2.3-70-g09d2 From 5d41893c0f9caf94b449eada0279a08c86f0212e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:57 +0200 Subject: ide: add PC_FLAG_ZIP_DRIVE pc flag Add 
PC_FLAG_ZIP_DRIVE pc flag, set it in idefloppy_do_request() and check for it (instead of checking for IDEFLOPPY_FLAG_ZIP_DRIVE) in idefloppy_transfer_pc(). This is a preparation for adding generic ide_transfer_pc() helper. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 8 ++++++-- include/linux/ide.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 1df6a314359..cff90c4b217 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -559,7 +559,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) * 40 and 50msec work well. idefloppy_pc_intr will not be actually * used until after the packet is moved in about 50 msec. */ - if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) { + if (pc->flags & PC_FLAG_ZIP_DRIVE) { timeout = floppy->ticks; expiry = &idefloppy_transfer_pc2; } else { @@ -575,7 +575,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) hwif->dma_ops->dma_start(drive); } - if ((floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) == 0) + if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) /* Send the actual packet */ hwif->output_data(drive, NULL, floppy->pc->c, 12); @@ -826,7 +826,11 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, return ide_stopped; } + if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) + pc->flags |= PC_FLAG_ZIP_DRIVE; + pc->rq = rq; + return idefloppy_issue_pc(drive, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 63cee2947f6..89feaea9e20 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -607,6 +607,7 @@ enum { PC_FLAG_WRITING = (1 << 6), /* command timed out */ PC_FLAG_TIMEDOUT = (1 << 7), + PC_FLAG_ZIP_DRIVE = (1 << 8), }; struct ide_atapi_pc { -- cgit v1.2.3-70-g09d2 From 594c16d8dd54cd7b1c5ef1ec3ac0f6bf34301dad Mon Sep 17 00:00:00 2001 From: Bartlomiej 
Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:58 +0200 Subject: ide: add ide_transfer_pc() helper * Add ide-atapi.c file for generic ATAPI support together with CONFIG_IDE_ATAPI config option. * Add generic ide_transfer_pc() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 6 +++++ drivers/ide/Makefile | 1 + drivers/ide/ide-atapi.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 28 +------------------ drivers/ide/ide-tape.c | 56 ++------------------------------------ drivers/scsi/ide-scsi.c | 30 ++------------------- include/linux/ide.h | 3 +++ 7 files changed, 85 insertions(+), 109 deletions(-) create mode 100644 drivers/ide/ide-atapi.c (limited to 'include/linux') diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 1607536ff5f..cf707c8f08d 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -98,6 +98,9 @@ if BLK_DEV_IDE comment "Please see Documentation/ide/ide.txt for help/info on IDE drives" +config IDE_ATAPI + bool + config BLK_DEV_IDE_SATA bool "Support for SATA (deprecated; conflicts with libata SATA driver)" default n @@ -201,6 +204,7 @@ config BLK_DEV_IDECD_VERBOSE_ERRORS config BLK_DEV_IDETAPE tristate "Include IDE/ATAPI TAPE support" + select IDE_ATAPI help If you have an IDE tape drive using the ATAPI protocol, say Y. ATAPI is a newer protocol used by IDE tape and CD-ROM drives, @@ -223,6 +227,7 @@ config BLK_DEV_IDETAPE config BLK_DEV_IDEFLOPPY tristate "Include IDE/ATAPI FLOPPY support" + select IDE_ATAPI ---help--- If you have an IDE floppy drive which uses the ATAPI protocol, answer Y. 
ATAPI is a newer protocol used by IDE CD-ROM/tape/floppy @@ -246,6 +251,7 @@ config BLK_DEV_IDEFLOPPY config BLK_DEV_IDESCSI tristate "SCSI emulation support" depends on SCSI + select IDE_ATAPI ---help--- WARNING: ide-scsi is no longer needed for cd writing applications! The 2.6 kernel supports direct writing to ide-cd, which eliminates diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index f94b679b611..a2b3f84d710 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -14,6 +14,7 @@ EXTRA_CFLAGS += -Idrivers/ide ide-core-y += ide.o ide-io.o ide-iops.o ide-lib.o ide-probe.o ide-taskfile.o # core IDE code +ide-core-$(CONFIG_IDE_ATAPI) += ide-atapi.o ide-core-$(CONFIG_BLK_DEV_IDEPCI) += setup-pci.o ide-core-$(CONFIG_BLK_DEV_IDEDMA) += ide-dma.o ide-core-$(CONFIG_IDE_PROC_FS) += ide-proc.o diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c new file mode 100644 index 00000000000..25939bc6040 --- /dev/null +++ b/drivers/ide/ide-atapi.c @@ -0,0 +1,70 @@ +/* + * ATAPI support. 
+ */ + +#include +#include +#include + +static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) +{ + ide_hwif_t *hwif = drive->hwif; + int retries = 100; + + while (retries-- && ((ireason & CD) == 0 || (ireason & IO))) { + printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " + "a packet command, retrying\n", drive->name); + udelay(100); + ireason = hwif->INB(hwif->io_ports.nsect_addr); + if (retries == 0) { + printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " + "a packet command, ignoring\n", + drive->name); + ireason |= CD; + ireason &= ~IO; + } + } + + return ireason; +} + +ide_startstop_t ide_transfer_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, + ide_expiry_t *expiry) +{ + ide_hwif_t *hwif = drive->hwif; + ide_startstop_t startstop; + u8 ireason; + + if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { + printk(KERN_ERR "%s: Strange, packet command initiated yet " + "DRQ isn't asserted\n", drive->name); + return startstop; + } + + ireason = hwif->INB(hwif->io_ports.nsect_addr); + if (drive->media == ide_tape && !drive->scsi) + ireason = ide_wait_ireason(drive, ireason); + + if ((ireason & CD) == 0 || (ireason & IO)) { + printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " + "a packet command\n", drive->name); + return ide_do_reset(drive); + } + + /* Set the interrupt routine */ + ide_set_handler(drive, handler, timeout, expiry); + + /* Begin DMA, if necessary */ + if (pc->flags & PC_FLAG_DMA_OK) { + pc->flags |= PC_FLAG_DMA_IN_PROGRESS; + hwif->dma_ops->dma_start(drive); + } + + /* Send the actual packet */ + if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) + hwif->output_data(drive, NULL, pc->c, 12); + + return ide_started; +} +EXPORT_SYMBOL_GPL(ide_transfer_pc); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a7c138dc324..e7a1025c03c 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -532,25 +532,11 @@ static int 
idefloppy_transfer_pc2(ide_drive_t *drive) static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idefloppy_floppy_t *floppy = drive->driver_data; struct ide_atapi_pc *pc = floppy->pc; ide_expiry_t *expiry; unsigned int timeout; - ide_startstop_t startstop; - u8 ireason; - if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset(drive); - } /* * The following delay solves a problem with ATAPI Zip 100 drives * where the Busy flag was apparently being deasserted before the @@ -567,19 +553,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) expiry = NULL; } - ide_set_handler(drive, &idefloppy_pc_intr, timeout, expiry); - - /* Begin DMA, if necessary */ - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) - /* Send the actual packet */ - hwif->output_data(drive, NULL, floppy->pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, pc, idefloppy_pc_intr, timeout, expiry); } static void ide_floppy_report_error(idefloppy_floppy_t *floppy, diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2a362138f97..5adc2c9ae41 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -947,64 +947,12 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) * again, the callback function will be called and then we will handle the next * request. 
*/ - -static u8 ide_tape_wait_ireason(ide_drive_t *drive, u8 ireason) -{ - ide_hwif_t *hwif = drive->hwif; - int retries = 100; - - while (retries-- && ((ireason & CD) == 0 || (ireason & IO))) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " - "a packet command, retrying\n", drive->name); - udelay(100); - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if (retries == 0) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " - "a packet command, ignoring\n", - drive->name); - ireason |= CD; - ireason &= ~IO; - } - } - - return ireason; -} - static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - struct ide_atapi_pc *pc = tape->pc; - ide_startstop_t startstop; - u8 ireason; - - if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - - ireason = hwif->INB(hwif->io_ports.nsect_addr); - ireason = ide_tape_wait_ireason(drive, ireason); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset(drive); - } - /* Set the interrupt routine */ - ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL); - - /* Begin DMA, if necessary */ - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - /* Send the actual packet */ - hwif->output_data(drive, NULL, pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, tape->pc, idetape_pc_intr, + IDETAPE_WAIT_CMD, NULL); } static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index c9fdf60c9dc..d41348f2245 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -453,36 +453,10 @@ static ide_startstop_t idescsi_pc_intr 
(ide_drive_t *drive) static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idescsi_scsi_t *scsi = drive_to_idescsi(drive); - struct ide_atapi_pc *pc = scsi->pc; - ide_startstop_t startstop; - u8 ireason; - - if (ide_wait_stat(&startstop,drive,DRQ_STAT,BUSY_STAT,WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset (drive); - } - /* Set the interrupt routine */ - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - /* Send the actual packet */ - hwif->output_data(drive, NULL, scsi->pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, scsi->pc, idescsi_pc_intr, + get_timeout(scsi->pc), idescsi_expiry); } static inline int idescsi_set_direction(struct ide_atapi_pc *pc) diff --git a/include/linux/ide.h b/include/linux/ide.h index 89feaea9e20..bed3c58798a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -967,6 +967,9 @@ extern int drive_is_ready(ide_drive_t *); void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); +ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, + ide_handler_t *, unsigned int, ide_expiry_t *); + ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *); void task_end_request(ide_drive_t *, struct request *, u8); -- cgit v1.2.3-70-g09d2 From 28c7214bd8c2bbd4873b8f1e7f58d86d3731124f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:59 +0200 Subject: ide: add PC_FLAG_DRQ_INTERRUPT pc flag Add PC_FLAG_DRQ_INTERRUPT pc flag, set it in ide*_do_request() and check for it (instead of checking 
for IDE*_FLAG_DRQ_INTERRUPT) in ide*_issue_pc(). This is a preparation for adding generic ide_issue_pc() helper. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-tape.c | 11 ++++++++--- drivers/scsi/ide-scsi.c | 6 +++++- include/linux/ide.h | 1 + 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index e7a1025c03c..13f650fa212 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -619,7 +619,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); - if (floppy->flags & IDEFLOPPY_FLAG_DRQ_INTERRUPT) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { /* Issue the packet command */ ide_execute_command(drive, WIN_PACKETCMD, &idefloppy_transfer_pc1, @@ -800,6 +800,9 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, return ide_stopped; } + if (floppy->flags & IDEFLOPPY_FLAG_DRQ_INTERRUPT) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; + if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) pc->flags |= PC_FLAG_ZIP_DRIVE; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 5adc2c9ae41..cba18a67550 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1020,7 +1020,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); - if (test_bit(IDETAPE_FLAG_DRQ_INTERRUPT, &tape->flags)) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { ide_execute_command(drive, WIN_PACKETCMD, &idetape_transfer_pc, IDETAPE_WAIT_CMD, NULL); return ide_started; @@ -1143,8 +1143,10 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, } /* Retry a failed packet command */ - if (tape->failed_pc && tape->pc->c[0] == REQUEST_SENSE) - return idetape_issue_pc(drive, tape->failed_pc); + if 
(tape->failed_pc && tape->pc->c[0] == REQUEST_SENSE) { + pc = tape->failed_pc; + goto out; + } if (postponed_rq != NULL) if (rq != postponed_rq) { @@ -1216,6 +1218,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, } BUG(); out: + if (test_bit(IDETAPE_FLAG_DRQ_INTERRUPT, &tape->flags)) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; + return idetape_issue_pc(drive, pc); } diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 1d261298d61..b7c5e839157 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -525,7 +525,7 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, 0, bcount, dma); - if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { ide_execute_command(drive, WIN_PACKETCMD, &idescsi_transfer_pc, get_timeout(pc), idescsi_expiry); return ide_started; @@ -548,6 +548,10 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *r if (blk_sense_request(rq) || blk_special_request(rq)) { struct ide_atapi_pc *pc = (struct ide_atapi_pc *)rq->special; + idescsi_scsi_t *scsi = drive_to_idescsi(drive); + + if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; if (drive->using_dma && !idescsi_map_sg(drive, pc)) pc->flags |= PC_FLAG_DMA_OK; diff --git a/include/linux/ide.h b/include/linux/ide.h index bed3c58798a..c2274ad44b2 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -608,6 +608,7 @@ enum { /* command timed out */ PC_FLAG_TIMEDOUT = (1 << 7), PC_FLAG_ZIP_DRIVE = (1 << 8), + PC_FLAG_DRQ_INTERRUPT = (1 << 9), }; struct ide_atapi_pc { -- cgit v1.2.3-70-g09d2 From 6bf1641ca1c7554f0da54aaf89788731b541bacc Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:22:00 +0200 Subject: ide: add ide_issue_pc() helper Add generic ide_issue_pc() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. 
There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 35 ++-------------------------------- drivers/ide/ide-tape.c | 30 ++--------------------------- drivers/scsi/ide-scsi.c | 30 ++--------------------------- include/linux/ide.h | 2 ++ 5 files changed, 57 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 25939bc6040..932a83abaf0 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -68,3 +68,52 @@ ide_startstop_t ide_transfer_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, return ide_started; } EXPORT_SYMBOL_GPL(ide_transfer_pc); + +ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, + ide_expiry_t *expiry) +{ + ide_hwif_t *hwif = drive->hwif; + u16 bcount; + u8 dma = 0; + + /* We haven't transferred any data yet */ + pc->xferred = 0; + pc->cur_pos = pc->buf; + + /* Request to transfer the entire buffer at once */ + if (drive->media == ide_tape && !drive->scsi) + bcount = pc->req_xfer; + else + bcount = min(pc->req_xfer, 63 * 1024); + + if (pc->flags & PC_FLAG_DMA_ERROR) { + pc->flags &= ~PC_FLAG_DMA_ERROR; + ide_dma_off(drive); + } + + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) { + if (drive->scsi) + hwif->sg_mapped = 1; + dma = !hwif->dma_ops->dma_setup(drive); + if (drive->scsi) + hwif->sg_mapped = 0; + } + + if (!dma) + pc->flags &= ~PC_FLAG_DMA_OK; + + ide_pktcmd_tf_load(drive, drive->scsi ? 
0 : IDE_TFLAG_OUT_DEVICE, + bcount, dma); + + /* Issue the packet command */ + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { + ide_execute_command(drive, WIN_PACKETCMD, handler, + timeout, NULL); + return ide_started; + } else { + ide_execute_pkt_cmd(drive); + return (*handler)(drive); + } +} +EXPORT_SYMBOL_GPL(ide_issue_pc); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 13f650fa212..e658aafc51d 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -576,9 +576,6 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { idefloppy_floppy_t *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; - u16 bcount; - u8 dma; if (floppy->failed_pc == NULL && pc->c[0] != GPCMD_REQUEST_SENSE) @@ -600,37 +597,9 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, debug_log("Retry number - %d\n", pc->retries); pc->retries++; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - bcount = min(pc->req_xfer, 63 * 1024); - - if (pc->flags & PC_FLAG_DMA_ERROR) { - pc->flags &= ~PC_FLAG_DMA_ERROR; - ide_dma_off(drive); - } - dma = 0; - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) - dma = !hwif->dma_ops->dma_setup(drive); - - if (!dma) - pc->flags &= ~PC_FLAG_DMA_OK; - - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); - - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - /* Issue the packet command */ - ide_execute_command(drive, WIN_PACKETCMD, - &idefloppy_transfer_pc1, - IDEFLOPPY_WAIT_CMD, - NULL); - return ide_started; - } else { - /* Issue the packet command */ - ide_execute_pkt_cmd(drive); - return idefloppy_transfer_pc1(drive); - } + return ide_issue_pc(drive, pc, idefloppy_transfer_pc1, + IDEFLOPPY_WAIT_CMD, NULL); } static void idefloppy_create_prevent_cmd(struct ide_atapi_pc *pc, int prevent) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cba18a67550..7907a1e4191 100644 --- a/drivers/ide/ide-tape.c +++ 
b/drivers/ide/ide-tape.c @@ -958,10 +958,7 @@ static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive) static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - int dma_ok = 0; - u16 bcount; if (tape->pc->c[0] == REQUEST_SENSE && pc->c[0] == REQUEST_SENSE) { @@ -1002,32 +999,9 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); pc->retries++; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - /* Request to transfer the entire buffer at once */ - bcount = pc->req_xfer; - - if (pc->flags & PC_FLAG_DMA_ERROR) { - pc->flags &= ~PC_FLAG_DMA_ERROR; - ide_dma_off(drive); - } - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) - dma_ok = !hwif->dma_ops->dma_setup(drive); - - if (!dma_ok) - pc->flags &= ~PC_FLAG_DMA_OK; - - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - ide_execute_command(drive, WIN_PACKETCMD, &idetape_transfer_pc, - IDETAPE_WAIT_CMD, NULL); - return ide_started; - } else { - ide_execute_pkt_cmd(drive); - return idetape_transfer_pc(drive); - } + return ide_issue_pc(drive, pc, idetape_transfer_pc, + IDETAPE_WAIT_CMD, NULL); } /* A mode sense command is used to "sense" tape parameters. 
*/ diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index b7c5e839157..32415466fbf 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -502,38 +502,12 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { idescsi_scsi_t *scsi = drive_to_idescsi(drive); - ide_hwif_t *hwif = drive->hwif; - u16 bcount; - u8 dma = 0; /* Set the current packet command */ scsi->pc = pc; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - /* Request to transfer the entire buffer at once */ - bcount = min(pc->req_xfer, 63 * 1024); - - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) { - hwif->sg_mapped = 1; - dma = !hwif->dma_ops->dma_setup(drive); - hwif->sg_mapped = 0; - } - - if (!dma) - pc->flags &= ~PC_FLAG_DMA_OK; - ide_pktcmd_tf_load(drive, 0, bcount, dma); - - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - ide_execute_command(drive, WIN_PACKETCMD, &idescsi_transfer_pc, - get_timeout(pc), idescsi_expiry); - return ide_started; - } else { - /* Issue the packet command */ - ide_execute_pkt_cmd(drive); - return idescsi_transfer_pc(drive); - } + return ide_issue_pc(drive, pc, idescsi_transfer_pc, + get_timeout(pc), idescsi_expiry); } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index c2274ad44b2..fee07a7edb1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -970,6 +970,8 @@ void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, ide_handler_t *, unsigned int, ide_expiry_t *); +ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_atapi_pc *, + ide_handler_t *, unsigned int, ide_expiry_t *); ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *); -- cgit v1.2.3-70-g09d2 From 646c0cb6c430f8d3ad3769dd1518fe664ff0ce27 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:22:03 +0200 Subject: ide: add ide_pc_intr() helper * ide-tape.c: add 'drive' argument to 
idetape_update_buffers(). * Add generic ide_pc_intr() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. * ide-tape.c: remove no longer needed DBG_PC_INTR. There should be no functional changes caused by this patch (unless the debugging is explicitly compiled in). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 128 +--------------------------------- drivers/ide/ide-tape.c | 132 ++--------------------------------- drivers/scsi/ide-scsi.c | 115 +----------------------------- include/linux/ide.h | 6 ++ 5 files changed, 195 insertions(+), 363 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 932a83abaf0..2802031de67 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -5,6 +5,183 @@ #include #include #include +#include + +#ifdef DEBUG +#define debug_log(fmt, args...) \ + printk(KERN_INFO "ide: " fmt, ## args) +#else +#define debug_log(fmt, args...)
do {} while (0) +#endif + +/* TODO: unify the code thus making some arguments go away */ +ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry, + void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *), + void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *), + void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned, int)) +{ + ide_hwif_t *hwif = drive->hwif; + xfer_func_t *xferfunc; + unsigned int temp; + u16 bcount; + u8 stat, ireason, scsi = drive->scsi; + + debug_log("Enter %s - interrupt handler\n", __func__); + + if (pc->flags & PC_FLAG_TIMEDOUT) { + pc->callback(drive); + return ide_stopped; + } + + /* Clear the interrupt */ + stat = ide_read_status(drive); + + if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { + if (hwif->dma_ops->dma_end(drive) || + (drive->media == ide_tape && !scsi && (stat & ERR_STAT))) { + if (drive->media == ide_floppy && !scsi) + printk(KERN_ERR "%s: DMA %s error\n", + drive->name, rq_data_dir(pc->rq) + ? 
"write" : "read"); + pc->flags |= PC_FLAG_DMA_ERROR; + } else { + pc->xferred = pc->req_xfer; + if (update_buffers) + update_buffers(drive, pc); + } + debug_log("%s: DMA finished\n", drive->name); + } + + /* No more interrupts */ + if ((stat & DRQ_STAT) == 0) { + debug_log("Packet command completed, %d bytes transferred\n", + pc->xferred); + + pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; + + local_irq_enable_in_hardirq(); + + if (drive->media == ide_tape && !scsi && + (stat & ERR_STAT) && pc->c[0] == REQUEST_SENSE) + stat &= ~ERR_STAT; + if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { + /* Error detected */ + debug_log("%s: I/O error\n", drive->name); + + if (drive->media != ide_tape || scsi) { + pc->rq->errors++; + if (scsi) + goto cmd_finished; + } + + if (pc->c[0] == REQUEST_SENSE) { + printk(KERN_ERR "%s: I/O error in request sense" + " command\n", drive->name); + return ide_do_reset(drive); + } + + debug_log("[cmd %x]: check condition\n", pc->c[0]); + + /* Retry operation */ + retry_pc(drive); + /* queued, but not started */ + return ide_stopped; + } +cmd_finished: + pc->error = 0; + if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && + (stat & SEEK_STAT) == 0) { + dsc_handle(drive); + return ide_stopped; + } + /* Command finished - Call the callback function */ + pc->callback(drive); + return ide_stopped; + } + + if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { + pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; + printk(KERN_ERR "%s: The device wants to issue more interrupts " + "in DMA mode\n", drive->name); + ide_dma_off(drive); + return ide_do_reset(drive); + } + /* Get the number of bytes to transfer on this interrupt. 
*/ + bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | + hwif->INB(hwif->io_ports.lbam_addr); + + ireason = hwif->INB(hwif->io_ports.nsect_addr); + + if (ireason & CD) { + printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); + return ide_do_reset(drive); + } + if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { + /* Hopefully, we will never get here */ + printk(KERN_ERR "%s: We wanted to %s, but the device wants us " + "to %s!\n", drive->name, + (ireason & IO) ? "Write" : "Read", + (ireason & IO) ? "Read" : "Write"); + return ide_do_reset(drive); + } + if (!(pc->flags & PC_FLAG_WRITING)) { + /* Reading - Check that we have enough space */ + temp = pc->xferred + bcount; + if (temp > pc->req_xfer) { + if (temp > pc->buf_size) { + printk(KERN_ERR "%s: The device wants to send " + "us more data than expected - " + "discarding data\n", + drive->name); + if (scsi) + temp = pc->buf_size - pc->xferred; + else + temp = 0; + if (temp) { + if (pc->sg) + io_buffers(drive, pc, temp, 0); + else + hwif->input_data(drive, NULL, + pc->cur_pos, temp); + printk(KERN_ERR "%s: transferred %d of " + "%d bytes\n", + drive->name, + temp, bcount); + } + pc->xferred += temp; + pc->cur_pos += temp; + ide_pad_transfer(drive, 0, bcount - temp); + ide_set_handler(drive, handler, timeout, + expiry); + return ide_started; + } + debug_log("The device wants to send us more data than " + "expected - allowing transfer\n"); + } + xferfunc = hwif->input_data; + } else + xferfunc = hwif->output_data; + + if ((drive->media == ide_floppy && !scsi && !pc->buf) || + (drive->media == ide_tape && !scsi && pc->bh) || + (scsi && pc->sg)) + io_buffers(drive, pc, bcount, !!(pc->flags & PC_FLAG_WRITING)); + else + xferfunc(drive, NULL, pc->cur_pos, bcount); + + /* Update the current position */ + pc->xferred += bcount; + pc->cur_pos += bcount; + + debug_log("[cmd %x] transferred %d bytes on that intr.\n", + pc->c[0], bcount); + + /* And set the interrupt handler again */ + 
ide_set_handler(drive, handler, timeout, expiry); + return ide_started; +} +EXPORT_SYMBOL_GPL(ide_pc_intr); static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) { diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 70aef97fb8b..0f3602a5efb 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -388,132 +388,10 @@ static void idefloppy_retry_pc(ide_drive_t *drive) static ide_startstop_t idefloppy_pc_intr(ide_drive_t *drive) { idefloppy_floppy_t *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; - struct ide_atapi_pc *pc = floppy->pc; - struct request *rq = pc->rq; - xfer_func_t *xferfunc; - unsigned int temp; - int dma_error = 0; - u16 bcount; - u8 stat, ireason; - - debug_log("Enter %s - interrupt handler\n", __func__); - - /* Clear the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - dma_error = hwif->dma_ops->dma_end(drive); - if (dma_error) { - printk(KERN_ERR "%s: DMA %s error\n", drive->name, - rq_data_dir(rq) ? 
"write" : "read"); - pc->flags |= PC_FLAG_DMA_ERROR; - } else { - pc->xferred = pc->req_xfer; - idefloppy_update_buffers(drive, pc); - } - debug_log("%s: DMA finished\n", drive->name); - } - - /* No more interrupts */ - if ((stat & DRQ_STAT) == 0) { - debug_log("Packet command completed, %d bytes transferred\n", - pc->xferred); - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - - local_irq_enable_in_hardirq(); - - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log("%s: I/O error\n", drive->name); - rq->errors++; - if (pc->c[0] == GPCMD_REQUEST_SENSE) { - printk(KERN_ERR "%s: I/O error in request sense" - " command\n", drive->name); - return ide_do_reset(drive); - } - - debug_log("[cmd %x]: check condition\n", pc->c[0]); - - /* Retry operation */ - idefloppy_retry_pc(drive); - /* queued, but not started */ - return ide_stopped; - } - pc->error = 0; - /* Command finished - Call the callback function */ - pc->callback(drive); - return ide_stopped; - } - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - - /* Get the number of bytes to transfer */ - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - /* on this interrupt */ - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset(drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - /* Reading - Check that we have enough space */ - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - ide_pad_transfer(drive, 0, bcount); - - ide_set_handler(drive, - &idefloppy_pc_intr, - IDEFLOPPY_WAIT_CMD, - NULL); - return ide_started; - } - debug_log("The device wants to send us more data than " - "expected - allowing transfer\n"); - } - } - if (pc->flags & PC_FLAG_WRITING) - xferfunc = hwif->output_data; - else - xferfunc = hwif->input_data; - - if (pc->buf) - xferfunc(drive, NULL, pc->cur_pos, bcount); - else - ide_floppy_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log("[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL); - return ide_started; + return ide_pc_intr(drive, floppy->pc, idefloppy_pc_intr, + IDEFLOPPY_WAIT_CMD, NULL, idefloppy_update_buffers, + idefloppy_retry_pc, NULL, ide_floppy_io_buffers); } /* diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 10f2d333628..0afa109ec99 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -56,8 +56,6 @@ enum { DBG_PROCS = (1 << 3), /* buffer alloc info (pc_stack & rq_stack) */ DBG_PCRQ_STACK = (1 << 4), - /* IRQ handler (always log debug info if debugging is on) */ - DBG_PC_INTR = (1 << 5), }; /* define to see debug info */ @@ -66,7 +64,7 @@ enum { #if IDETAPE_DEBUG_LOG #define debug_log(lvl, fmt, args...) 
\ { \ - if ((lvl & DBG_PC_INTR) || (tape->debug_mask & lvl)) \ + if (tape->debug_mask & lvl) \ printk(KERN_INFO "ide-tape: " fmt, ## args); \ } #else @@ -441,7 +439,7 @@ static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, } } -static void idetape_update_buffers(struct ide_atapi_pc *pc) +static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) { struct idetape_bh *bh = pc->bh; int count; @@ -526,7 +524,7 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) pc->xferred = pc->req_xfer - tape->blk_size * get_unaligned_be32(&sense[3]); - idetape_update_buffers(pc); + idetape_update_buffers(drive, pc); } /* @@ -800,129 +798,11 @@ static void ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, */ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - struct ide_atapi_pc *pc = tape->pc; - xfer_func_t *xferfunc; - unsigned int temp; - u16 bcount; - u8 stat, ireason; - - debug_log(DBG_PC_INTR, "Enter %s - interrupt handler\n", __func__); - - /* Clear the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive) || (stat & ERR_STAT)) { - pc->flags |= PC_FLAG_DMA_ERROR; - } else { - pc->xferred = pc->req_xfer; - idetape_update_buffers(pc); - } - debug_log(DBG_PC_INTR, "%s: DMA finished\n", drive->name); - } - - /* No more interrupts */ - if ((stat & DRQ_STAT) == 0) { - debug_log(DBG_PC_INTR, "Packet command completed, %d bytes" - " transferred\n", pc->xferred); - - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - local_irq_enable_in_hardirq(); - - if ((stat & ERR_STAT) && pc->c[0] == REQUEST_SENSE) - stat &= ~ERR_STAT; - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log(DBG_PC_INTR, "%s: I/O error\n", drive->name); - - if (pc->c[0] == REQUEST_SENSE) { - printk(KERN_ERR "%s: I/O error in request sense" - " 
command\n", drive->name); - return ide_do_reset(drive); - } - debug_log(DBG_PC_INTR, "[cmd %x]: check condition\n", - pc->c[0]); - - /* Retry operation */ - idetape_retry_pc(drive); - return ide_stopped; - } - pc->error = 0; - if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && - (stat & SEEK_STAT) == 0) { - ide_tape_handle_dsc(drive); - return ide_stopped; - } - /* Command finished - Call the callback function */ - pc->callback(drive); - return ide_stopped; - } - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - /* Get the number of bytes to transfer on this interrupt. */ - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset(drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - /* Reading - Check that we have enough space */ - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - ide_pad_transfer(drive, 0, bcount); - ide_set_handler(drive, &idetape_pc_intr, - IDETAPE_WAIT_CMD, NULL); - return ide_started; - } - debug_log(DBG_PC_INTR, "The device wants to send us more " - "data than expected - allowing transfer\n"); - } - xferfunc = hwif->input_data; - } else { - xferfunc = hwif->output_data; - } - - if (pc->bh) - ide_tape_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - else - xferfunc(drive, NULL, pc->cur_pos, bcount); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log(DBG_PC_INTR, "[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL); - return ide_started; + return ide_pc_intr(drive, tape->pc, idetape_pc_intr, IDETAPE_WAIT_CMD, + NULL, idetape_update_buffers, idetape_retry_pc, + ide_tape_handle_dsc, ide_tape_io_buffers); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index ada733ca672..683bce375c7 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -356,120 +356,11 @@ static int idescsi_expiry(ide_drive_t *drive) static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive) { idescsi_scsi_t *scsi = drive_to_idescsi(drive); - ide_hwif_t *hwif = drive->hwif; struct ide_atapi_pc *pc = scsi->pc; - struct request *rq = pc->rq; - xfer_func_t *xferfunc; - unsigned int temp; - u16 bcount; - u8 stat, ireason; - - debug_log("Enter %s - interrupt handler\n", __func__); - - if (pc->flags & PC_FLAG_TIMEDOUT) { - pc->callback(drive); - return ide_stopped; - } - - /* Clear 
the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive)) - pc->flags |= PC_FLAG_DMA_ERROR; - else - pc->xferred = pc->req_xfer; - debug_log("%s: DMA finished\n", drive->name); - } - - if ((stat & DRQ_STAT) == 0) { - /* No more interrupts */ - debug_log("Packet command completed, %d bytes transferred\n", - pc->xferred); - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - local_irq_enable_in_hardirq(); - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log("%s: I/O error\n", drive->name); - - rq->errors++; - } - pc->callback(drive); - return ide_stopped; - } - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset (drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - temp = pc->buf_size - pc->xferred; - if (temp) { - if (pc->sg) - ide_scsi_io_buffers(drive, pc, - temp, 0); - else - hwif->input_data(drive, NULL, - pc->cur_pos, temp); - printk(KERN_ERR "%s: transferred %d of " - "%d bytes\n", - drive->name, - temp, bcount); - } - pc->xferred += temp; - pc->cur_pos += temp; - ide_pad_transfer(drive, 0, bcount - temp); - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - return ide_started; - } - debug_log("The device wants to send us more data than " - "expected - allowing transfer\n"); - } - xferfunc = hwif->input_data; - } else - xferfunc = hwif->output_data; - - if (pc->sg) - ide_scsi_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - else - xferfunc(drive, NULL, pc->cur_pos, bcount); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log("[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - return ide_started; + return ide_pc_intr(drive, pc, idescsi_pc_intr, get_timeout(pc), + idescsi_expiry, NULL, NULL, NULL, + ide_scsi_io_buffers); } static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive) diff --git a/include/linux/ide.h b/include/linux/ide.h index fee07a7edb1..ac4eeb2932e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -968,6 +968,12 @@ extern int drive_is_ready(ide_drive_t *); void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); +ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry, + 
void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *), + void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *), + void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned int, + int)); ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, ide_handler_t *, unsigned int, ide_expiry_t *); ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_atapi_pc *, -- cgit v1.2.3-70-g09d2