diff options
Diffstat (limited to 'kernel')
37 files changed, 5564 insertions, 2915 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index dcf6789bf54..9943202b435 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,7 +4,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -97,6 +98,7 @@ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b5cb469d254..3cf2183b472 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -/* FIXME: see the FIXME in partition_sched_domains() */ -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - struct cpumask *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(cpumask_size(), GFP_KERNEL); + ndoms = 1; + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms, top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.cpus_allowed); - ndoms = 1; goto done; } @@ -636,7 +635,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ - doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -656,7 +655,7 @@ restart: continue; } - dp = doms + nslot; + dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; @@ -718,7 +717,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; get_online_cpus(); @@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; switch (phase) { @@ -2537,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = { }; #endif /* CONFIG_PROC_PID_CPUSET */ -/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ +/* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); diff --git a/kernel/exit.c b/kernel/exit.c index f7864ac2ecc..80ae941cfd2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); + /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ @@ -1205,6 +1210,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; + cputime_t tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1220,20 +1226,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * need to protect the access to parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_times() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_times(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(tgutime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(tgstime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --git a/kernel/fork.c b/kernel/fork.c index 166b8c49257..3d6f121bbe8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + sig->prev_utime = sig->prev_stime = cputime_zero; +#endif sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -1066,8 +1069,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; +#endif p->default_timer_slack_ns = current->timer_slack_ns; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..cf5ee162841 --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,423 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include <linux/irqflags.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/hw_breakpoint.h> + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu) +{ + int i; + unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + + for (i = HBP_NUM -1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +{ + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned += max_task_bp_pinned(cpu); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr += max_task_bp_pinned(cpu); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + int count = 0; + struct perf_event *bp; + struct perf_event_context *ctx = tsk->perf_event_ctxp; + unsigned int *tsk_pinned; + struct list_head *list; + unsigned long flags; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) + return; + + tsk_pinned = per_cpu(task_bp_pinned, cpu); + if (enable) { + tsk_pinned[count]++; + if (count > 0) + tsk_pinned[count-1]--; + } else { + tsk_pinned[count]--; + if (count > 0) + tsk_pinned[count-1]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void toggle_bp_slot(struct perf_event *bp, bool enable) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + /* Pinned counter task profiling */ + if (tsk) { + if (cpu >= 0) { + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + for_each_online_cpu(cpu) + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + /* Pinned counter cpu profiling */ + if (enable) + per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + else + per_cpu(nr_cpu_bp_pinned, bp->cpu)--; +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + */ +int reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + int ret = 0; + + mutex_lock(&nr_bp_mutex); + + fetch_bp_busy_slots(&slots, bp->cpu); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) == HBP_NUM) { + ret = -ENOSPC; + goto end; + } + + toggle_bp_slot(bp, true); + +end: + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + toggle_bp_slot(bp, false); + + mutex_unlock(&nr_bp_mutex); +} + + +int __register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + /* + * Ptrace breakpoints can be temporary perf events only + * meant to reserve a slot. In this case, it is created disabled and + * we don't want to check the params right now (as we put a null addr) + * But perf tools create events as disabled and we want to check + * the params for them. + * This is a quick hack that will be removed soon, once we remove + * the tmp breakpoints from ptrace + */ + if (!bp->attr.disabled || bp->callback == perf_bp_event) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + + return ret; +} + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + bp->callback = perf_bp_event; + + return __register_perf_hw_breakpoint(bp); +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_callback_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, + perf_callback_t triggered, + struct task_struct *tsk) +{ + /* + * FIXME: do it without unregistering + * - We don't want to lose our slot + * - If the new bp is incorrect, don't lose the older one + */ + unregister_hw_breakpoint(bp); + + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event ** +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_callback_t triggered) +{ + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} +core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8b6b8b697c6..8e5288a8a35 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 9147a3190c9..7d701463402 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs + * in schedule() sleeping, since all other CPUs * are in kgdb_wait, and thus have debuggerinfo. */ if (local_debuggerinfo) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1494e85b35f..e5342a344c4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) */ static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; @@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = get_kprobe(p->addr); + if (unlikely(!old_p)) + return NULL; + + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid; + return NULL; + } +valid: + return old_p; +} + +/* Return error if the kprobe is being re-registered */ +static inline int check_kprobe_rereg(struct kprobe *p) +{ + int ret = 0; + struct kprobe *old_p; + + mutex_lock(&kprobe_mutex); + old_p = __get_valid_kprobe(p); + if (old_p) + ret = -EINVAL; + mutex_unlock(&kprobe_mutex); + return ret; +} + int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; @@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; p->addr = addr; + ret = check_kprobe_rereg(p); + if (ret) + return ret; + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { @@ -754,26 +795,6 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return NULL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return old_p; -} - /* * Unregister a kprobe without a scheduler synchronization. */ @@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9af56723c09..f5dcd36d315 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,7 +49,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include <trace/events/lockdep.h> +#include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5eced..acd24e7643e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace notify_die(enum die_val val, const char *str, +int notrace __kprobes notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7f29643c898..6b7ddba1dd6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -28,6 +28,8 @@ #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/perf_event.h> +#include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> @@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; + update_event_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -445,50 +502,11 @@ retry: * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&event->group_entry)) { + if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - } spin_unlock_irq(&ctx->lock); } -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - event->total_time_enabled = ctx->time - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = ctx->time; - - event->total_time_running = run_end - event->tstamp_running; -} - /* * Update total_time_enabled and total_time_running for all events in a group. */ @@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) + if (ctx->nr_active) { list_for_each_entry(event, &ctx->group_list, group_entry) group_sched_out(event, cpuctx, ctx); - + } perf_enable(); out: spin_unlock(&ctx->lock); @@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_event_read(void *event); - static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { @@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event, */ switch (event->state) { case PERF_EVENT_STATE_ACTIVE: - __perf_event_read(event); - break; + event->pmu->read(event); + /* fall-through */ case PERF_EVENT_STATE_INACTIVE: update_event_times(event); @@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, if (!ctx->nr_stat) return; + update_context_time(ctx); + event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); @@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task, if (likely(!ctx || !cpuctx->task_ctx)) return; - update_context_time(ctx); - rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp; @@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - event->pmu->read(event); + spin_lock(&ctx->lock); + update_context_time(ctx); update_event_times(event); - local_irq_restore(flags); + spin_unlock(&ctx->lock); + + event->pmu->read(event); } static u64 perf_event_read(struct perf_event *event) @@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event) smp_call_function_single(event->oncpu, __perf_event_read, event, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); update_event_times(event); + spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1658,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1665,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1696,16 +1720,10 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) +int perf_event_release_kernel(struct perf_event *event) { - struct perf_event *event = file->private_data; struct perf_event_context *ctx = event->ctx; - file->private_data = NULL; - WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); @@ -1720,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + + file->private_data = NULL; + + return perf_event_release_kernel(event); +} static int perf_event_read_size(struct perf_event *event) { @@ -1746,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event) return size; } -static u64 perf_event_read_value(struct perf_event *event) +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); total += perf_event_read(event); - list_for_each_entry(child, &event->child_list, child_list) + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); return total; } - -static int perf_event_read_entry(struct perf_event *event, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} +EXPORT_SYMBOL_GPL(perf_event_read_value); static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_event_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; + goto unlock; - size += err; + ret = size; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_event_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); - size += err; + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } + + ret += size; } +unlock: + mutex_unlock(&ctx->mutex); - return size; + return ret; } static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { + u64 enabled, running; u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + - atomic64_read(&event->child_total_time_running); - } + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); @@ -1861,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) ret = perf_event_read_group(event, read_format, buf); else ret = perf_event_read_one(event, read_format, buf); - mutex_unlock(&event->child_mutex); return ret; } @@ -1974,7 +2006,8 @@ unlock: return ret; } -int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2002,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -2174,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } #else @@ -2214,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); + kfree(data); } static void perf_mmap_data_free(struct perf_mmap_data *data) @@ -2307,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) } if (!data->watermark) - data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + data->watermark = max_size / 2; rcu_assign_pointer(event->data, data); @@ -2319,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) data = container_of(rcu_head, struct perf_mmap_data, rcu_head); perf_mmap_data_free(data); - kfree(data); } static void perf_mmap_data_release(struct perf_event *event) @@ -2666,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_lock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cpu; + int cur, cpu = get_cpu(); handle->locked = 0; - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; + for (;;) { + cur = atomic_cmpxchg(&data->lock, -1, cpu); + if (cur == -1) { + handle->locked = 1; + break; + } + if (cur == cpu) + break; - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); - - handle->locked = 1; + } } static void perf_output_unlock(struct perf_output_handle *handle) @@ -2725,7 +2763,7 @@ again: if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: - local_irq_restore(handle->flags); + put_cpu(); } void perf_output_copy(struct perf_output_handle *handle, @@ -3236,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_task_match(event)) perf_event_task_output(event, task_event); } - rcu_read_unlock(); } static void perf_event_task_event(struct perf_task_event *task_event) @@ -3252,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) @@ -3348,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_comm_match(event)) perf_event_comm_output(event, comm_event); } - rcu_read_unlock(); } static void perf_event_comm_event(struct perf_comm_event *comm_event) @@ -3367,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3375,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3472,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_mmap_match(event, mmap_event)) perf_event_mmap_output(event, mmap_event); } - rcu_read_unlock(); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -3536,11 +3559,11 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3679,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, perf_event_disable(event); } - perf_event_output(event, nmi, data, regs); + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + return ret; } @@ -3724,16 +3751,16 @@ again: return nr; } -static void perf_swevent_overflow(struct perf_event *event, +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; - u64 overflow; data->period = event->hw.last_period; - overflow = perf_swevent_set_period(event); + if (!overflow) + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; @@ -3766,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, atomic64_add(nr, &event->count); + if (!regs) + return; + if (!hwc->sample_period) return; - if (!regs) + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (atomic64_add_negative(nr, &hwc->period_left)) return; - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swevent_overflow(event, nmi, data, regs); + perf_swevent_overflow(event, 0, nmi, data, regs); } static int perf_swevent_is_counting(struct perf_event *event) @@ -3806,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; if (event->attr.type != type) return 0; + if (event->attr.config != event_id) return 0; - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 0; + if (perf_exclude_event(event, regs)) + return 0; - if (event->attr.exclude_kernel && !user_mode(regs)) - return 0; - } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; return 1; } @@ -3837,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } - rcu_read_unlock(); } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +int perf_swevent_get_recursion_context(void) { + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int rctx; + if (in_nmi()) - return &cpuctx->recursion[3]; + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (cpuctx->recursion[rctx]) { + put_cpu_var(perf_cpu_context); + return -1; + } - if (in_irq()) - return &cpuctx->recursion[2]; + cpuctx->recursion[rctx]++; + barrier(); - if (in_softirq()) - return &cpuctx->recursion[1]; + return rctx; +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - return &cpuctx->recursion[0]; +void perf_swevent_put_recursion_context(int rctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + barrier(); + cpuctx->recursion[rctx]--; + put_cpu_var(perf_cpu_context); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - + cpuctx = &__get_cpu_var(perf_cpu_context); + rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3888,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - struct perf_sample_data data = { - .addr = addr, - }; + struct perf_sample_data data; + int rctx; - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, - &data, regs); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + data.addr = addr; + data.raw = NULL; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); } static void perf_swevent_read(struct perf_event *event) @@ -3949,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.period = event->hw.last_period; regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt @@ -4108,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -4126,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); + /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4157,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + /* + * The breakpoint is already filled if we haven't created the counter + * through perf syscall + * FIXME: manage to get trigerred to NULL if it comes from syscalls + */ + if (!bp->callback) + err = register_perf_hw_breakpoint(bp); + else + err = __register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + sample.addr = bp->attr.bp_addr; + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); +} +#else +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} #endif atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4208,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + case PERF_COUNT_SW_ALIGNMENT_FAULTS: + case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; @@ -4228,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, + perf_callback_t callback, gfp_t gfpflags) { const struct pmu *pmu; @@ -4270,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (!callback && parent_event) + callback = parent_event->callback; + + event->callback = callback; + if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4304,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4416,7 +4589,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; @@ -4546,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open, } event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); + NULL, NULL, GFP_KERNEL); err = PTR_ERR(event); if (IS_ERR(event)) goto err_put_context; @@ -4594,6 +4767,60 @@ err_put_context: return err; } +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @pid: task to profile + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + pid_t pid, perf_callback_t callback) +{ + struct perf_event *event; + struct perf_event_context *ctx; + int err; + + /* + * Get the target context (task or percpu): + */ + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_exit; + } + + event = perf_event_alloc(attr, cpu, ctx, NULL, + NULL, callback, GFP_KERNEL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_put_context; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + return event; + + err_put_context: + put_ctx(ctx); + err_exit: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + /* * inherit a event from parent task to child task: */ @@ -4619,7 +4846,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child_ctx, group_leader, parent_event, - GFP_KERNEL); + NULL, GFP_KERNEL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -4637,6 +4864,8 @@ inherit_event(struct perf_event *parent_event, if (parent_event->attr.freq) child_event->hw.sample_period = parent_event->hw.sample_period; + child_event->overflow_handler = parent_event->overflow_handler; + /* * Link it up in the child's context: */ @@ -4726,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - update_event_times(child_event); perf_event_remove_from_context(child_event); parent_event = child_event->parent; @@ -4778,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child) * the events from it. */ unclone_ctx(child_ctx); + update_context_time(child_ctx); spin_unlock_irqrestore(&child_ctx->lock, flags); /* diff --git a/kernel/sched.c b/kernel/sched.c index 6ae2739b8f1..e7f2cfa6a25 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -535,14 +535,12 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -591,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2017,6 +2017,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) } spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; @@ -2078,7 +2079,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2115,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2376,14 +2377,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, task_rq_unlock(rq, &flags); cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) + if (cpu != orig_cpu) { + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); set_task_cpu(p, cpu); - + local_irq_restore(flags); + } rq = task_rq_lock(p, &flags); - if (rq != orig_rq) - update_rq_clock(rq); - WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2440,6 +2442,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -2545,6 +2558,7 @@ static void __sched_fork(struct task_struct *p) void sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + unsigned long flags; __sched_fork(p); @@ -2581,7 +2595,10 @@ void sched_fork(struct task_struct *p, int clone_flags) #ifdef CONFIG_SMP cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); #endif + local_irq_save(flags); + update_rq_clock(cpu_rq(cpu)); set_task_cpu(p, cpu); + local_irq_restore(flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2848,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -3018,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq) } /* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - -/* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). */ @@ -4126,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4289,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_online_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4429,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4443,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -5046,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* @@ -5162,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->utime; + *ut = p->utime; + *st = p->stime; } -cputime_t task_stime(struct task_struct *p) +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->stime; + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; } #else -cputime_t task_utime(struct task_struct *p) + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (clock_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; + *ut = p->prev_utime; + *st = p->prev_stime; } -cputime_t task_stime(struct task_struct *p) +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t stime; + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + thread_group_cputime(p, &cputime); - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - return p->prev_stime; -} -#endif + if (total) { + u64 temp; -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; } +#endif /* * This function gets called by the timer code, with HZ frequency. @@ -6175,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } @@ -6935,7 +6973,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } @@ -7406,17 +7444,16 @@ static struct ctl_table sd_ctl_dir[] = { .procname = "sched_domain", .mode = 0555, }, - {0, }, + {} }; static struct ctl_table sd_ctl_root[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = sd_ctl_dir, }, - {0, }, + {} }; static struct ctl_table *sd_alloc_ctl_entry(int n) @@ -7740,6 +7777,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7826,6 +7873,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; @@ -7905,6 +7955,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); @@ -8045,6 +8097,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -8881,7 +8934,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8903,6 +8956,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8914,12 +8992,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8969,19 +9047,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8989,8 +9067,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -9009,40 +9086,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; @@ -9364,10 +9441,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -9522,6 +9595,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif @@ -9571,7 +9646,9 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb84409bc4..6988cf08f70 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P(yld_count); P(sched_switch); P(sched_count); P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif P(ttwu_count); P(ttwu_local); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37087a7fac2..f61837ad336 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1345,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1398,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { + int target = -1; - affine_sd = tmp; - want_affine = 0; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + target = cpu; + + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } + cpu = target; + } } if (!want_sd && !want_affine) @@ -1679,7 +1734,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790cddb1..5c5fef37841 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, - const struct cpumask *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) - return this_cpu; - - first = cpumask_first(mask); - if (first < nr_cpu_ids) - return first; - - return -1; -} - static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task) * Otherwise, we consult the sched_domains span maps to figure * out which cpu is logically closest to our hot cache data. */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - cpumask_and(domain_mask, - sched_domain_span(sd), - lowest_mask); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - best_cpu = pick_optimal_cpu(this_cpu, - domain_mask); - - if (best_cpu != -1) { - free_cpumask_var(domain_mask); - return best_cpu; - } - } + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + return this_cpu; + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) + return best_cpu; } - free_cpumask_var(domain_mask); } /* @@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task) * just give the caller *something* to work with from the compatible * locations. */ - return pick_optimal_cpu(this_cpu, lowest_mask); + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; } /* Will lock the rq it finds */ diff --git a/kernel/signal.c b/kernel/signal.c index fe08008133d..6b982f2cf52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,7 +28,8 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/signal.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -856,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue *q; int override_rlimit; - trace_sched_signal_send(sig, t); + trace_signal_generate(sig, info, t); assert_spin_locked(&t->sighand->siglock); @@ -918,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) - /* - * Queue overflow, abort. We may abort if the signal was rt - * and sent by user using something other than kill(). - */ + if (sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + trace_signal_overflow_fail(sig, group, info); return -EAGAIN; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + trace_signal_lose_info(sig, group, info); + } } out_set: @@ -1859,6 +1869,9 @@ relock: ka = &sighand->action[signr-1]; } + /* Trace actually delivered signals. */ + trace_signal_deliver(signr, info, ka); + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 00889bd3c59..7494bbf5a27 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -49,7 +49,6 @@ static const int slow_work_max_vslow = 99; ctl_table slow_work_sysctls[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "min-threads", .data = &slow_work_min_threads, .maxlen = sizeof(unsigned), @@ -59,7 +58,6 @@ ctl_table slow_work_sysctls[] = { .extra2 = &slow_work_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max-threads", .data = &slow_work_max_threads, .maxlen = sizeof(unsigned), @@ -69,16 +67,15 @@ ctl_table slow_work_sysctls[] = { .extra2 = (void *) &slow_work_max_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "vslow-percentage", .data = &vslow_work_proportion, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = (void *) &slow_work_min_vslow, .extra2 = (void *) &slow_work_max_vslow, }, - { .ctl_name = 0 } + {} }; #endif diff --git a/kernel/sys.c b/kernel/sys.c index ce17760d9c5..9968c5fb55b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -911,16 +911,15 @@ change_okay: void do_sys_times(struct tms *tms) { - struct task_cputime cputime; - cputime_t cutime, cstime; + cputime_t tgutime, tgstime, cutime, cstime; - thread_group_cputime(current, &cputime); spin_lock_irq(¤t->sighand->siglock); + thread_group_times(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(cputime.utime); - tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); tms->tms_cstime = cputime_to_clock_t(cstime); } @@ -1338,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t utime, stime; - struct task_cputime cputime; + cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - utime = task_utime(current); - stime = task_stime(current); + task_times(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1373,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_cputime(p, &cputime); - utime = cputime_add(utime, cputime.utime); - stime = cputime_add(stime, cputime.stime); + thread_group_times(p, &tgutime, &tgstime); + utime = cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f050ba85d42..695384f12a7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -141,7 +141,6 @@ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); cond_syscall(sys32_ipc); -cond_syscall(sys32_sysctl); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dbf93a52ee..9327a26765c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemcheck.h> -#include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -61,7 +60,6 @@ #include <asm/io.h> #endif -static int deprecated_sysctl_warning(struct __sysctl_args *args); #if defined(CONFIG_SYSCTL) @@ -210,31 +208,26 @@ extern int lock_stat; static struct ctl_table root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = kern_table, }, { - .ctl_name = CTL_VM, .procname = "vm", .mode = 0555, .child = vm_table, }, { - .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, .child = fs_table, }, { - .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, .child = debug_table, }, { - .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = dev_table, @@ -243,7 +236,7 @@ static struct ctl_table root_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #ifdef CONFIG_SCHED_DEBUG @@ -255,192 +248,166 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static struct ctl_table kern_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SCHED_DEBUG { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_min_granularity_ns", .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_nr_latency_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_latency_ns", .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_nr_latency_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_ratelimit", .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_thresh", .data = &sysctl_sched_shares_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", .data = &sysctl_sched_features, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_nr_migrate", .data = &sysctl_sched_nr_migrate, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_time_avg", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_runtime_us", .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", .data = &sysctl_sched_compat_yield, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROVE_LOCKING { - .ctl_name = CTL_UNNUMBERED, .procname = "prove_locking", .data = &prove_locking, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_LOCK_STAT { - .ctl_name = CTL_UNNUMBERED, .procname = "lock_stat", .data = &lock_stat, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "core_pipe_limit", .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_taint, + .proc_handler = proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -449,181 +416,160 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLK_DEV_INITRD { - .ctl_name = KERN_REALROOTDEV, .procname = "real-root-dev", .data = &real_root_dev, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SPARC { - .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", .data = reboot_command, .maxlen = 256, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = KERN_SPARC_STOP_A, .procname = "stop-a", .data = &stop_a_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_SPARC_SCONS_PWROFF, .procname = "scons-poweroff", .data = &scons_pwroff, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SPARC64 { - .ctl_name = CTL_UNNUMBERED, .procname = "tsb-ratio", .data = &sysctl_tsb_ratio, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef __hppa__ { - .ctl_name = KERN_HPPA_PWRSW, .procname = "soft-power", .data = &pwrsw_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_HPPA_UNALIGNED, .procname = "unaligned-trap", .data = &unaligned_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_CTLALTDEL, .procname = "ctrl-alt-del", .data = &C_A_D, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_FUNCTION_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", .data = &ftrace_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &ftrace_enable_sysctl, + .proc_handler = ftrace_enable_sysctl, }, #endif #ifdef CONFIG_STACK_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "stack_tracer_enabled", .data = &stack_tracer_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &stack_trace_sysctl, + .proc_handler = stack_trace_sysctl, }, #endif #ifdef CONFIG_TRACING { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MODULES { - .ctl_name = KERN_MODPROBE, .procname = "modprobe", .data = &modprobe_path, .maxlen = KMOD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "modules_disabled", .data = &modules_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &one, .extra2 = &one, }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { - .ctl_name = KERN_HOTPLUG, .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #endif #ifdef CONFIG_CHR_DEV_SG { - .ctl_name = KERN_SG_BIG_BUFF, .procname = "sg-big-buff", .data = &sg_big_buff, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BSD_PROCESS_ACCT { - .ctl_name = KERN_ACCT, .procname = "acct", .data = &acct_parm, .maxlen = 3*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MAGIC_SYSRQ { - .ctl_name = KERN_SYSRQ, .procname = "sysrq", .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -632,215 +578,188 @@ static struct ctl_table kern_table[] = { .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_do_cad_pid, + .proc_handler = proc_do_cad_pid, }, #endif { - .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", .data = &max_threads, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_RANDOM, .procname = "random", .mode = 0555, .child = random_table, }, { - .ctl_name = KERN_OVERFLOWUID, .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = KERN_OVERFLOWGID, .procname = "overflowgid", .data = &overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { - .ctl_name = KERN_IEEE_EMULATION_WARNINGS, .procname = "ieee_emulation_warnings", .data = &sysctl_ieee_emulation_warnings, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", .data = &sysctl_userprocess_debug, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PIDMAX, .procname = "pid_max", .data = &pid_max, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, { - .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", .data = &panic_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined CONFIG_PRINTK { - .ctl_name = KERN_PRINTK, .procname = "printk", .data = &console_loglevel, .maxlen = 4*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", .data = &printk_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = KERN_PRINTK_RATELIMIT_BURST, .procname = "printk_ratelimit_burst", .data = &printk_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "printk_delay", .data = &printk_delay_msec, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &ten_thousand, }, #endif { - .ctl_name = KERN_NGROUPS_MAX, .procname = "ngroups_max", .data = &ngroups_max, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_nmi_enabled, + .proc_handler = proc_nmi_enabled, }, #endif #if defined(CONFIG_X86) { - .ctl_name = KERN_PANIC_ON_NMI, .procname = "panic_on_unrecovered_nmi", .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "panic_on_io_nmi", .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", .data = &bootloader_type, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "bootloader_version", .data = &bootloader_version, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_MMU) { - .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", .data = &randomize_va_space, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { - .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", .data = &spin_retry, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) @@ -849,123 +768,104 @@ static struct ctl_table kern_table[] = { .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_IA64 { - .ctl_name = KERN_IA64_UNALIGNED, .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DETECT_SOFTLOCKUP { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dosoftlockup_thresh, - .strategy = &sysctl_intvec, + .proc_handler = proc_dosoftlockup_thresh, .extra1 = &neg_one, .extra2 = &sixty, }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dohung_task_timeout_secs, - .strategy = &sysctl_intvec, + .proc_handler = proc_dohung_task_timeout_secs, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_COMPAT { - .ctl_name = KERN_COMPAT_LOG, .procname = "compat-log", .data = &compat_log, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_RT_MUTEXES { - .ctl_name = KERN_MAX_LOCK_DEPTH, .procname = "max_lock_depth", .data = &max_lock_depth, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", .data = &poweroff_cmd, .maxlen = POWEROFF_CMD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #ifdef CONFIG_KEYS { - .ctl_name = CTL_UNNUMBERED, .procname = "keys", .mode = 0555, .child = key_sysctls, @@ -973,17 +873,15 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_RCU_TORTURE_TEST { - .ctl_name = CTL_UNNUMBERED, .procname = "rcutorture_runnable", .data = &rcutorture_runnable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SLOW_WORK { - .ctl_name = CTL_UNNUMBERED, .procname = "slow-work", .mode = 0555, .child = slow_work_sysctls, @@ -991,146 +889,127 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_PERF_EVENTS { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_mlock_kb", .data = &sysctl_perf_event_mlock, .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_max_sample_rate", .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_KMEMCHECK { - .ctl_name = CTL_UNNUMBERED, .procname = "kmemcheck", .data = &kmemcheck_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLOCK { - .ctl_name = CTL_UNNUMBERED, .procname = "blk_iopoll", .data = &blk_iopoll_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table vm_table[] = { { - .ctl_name = VM_OVERCOMMIT_MEMORY, .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PANIC_ON_OOM, .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_kill_allocating_task", .data = &sysctl_oom_kill_allocating_task, .maxlen = sizeof(sysctl_oom_kill_allocating_task), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_dump_tasks", .data = &sysctl_oom_dump_tasks, .maxlen = sizeof(sysctl_oom_dump_tasks), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PAGE_CLUSTER, .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_DIRTY_BACKGROUND, .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &dirty_background_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, - .proc_handler = &dirty_background_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_bytes_handler, .extra1 = &one_ul, }, { - .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, - .proc_handler = &dirty_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, - .proc_handler = &dirty_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_bytes_handler, .extra1 = &dirty_bytes_min, }, { @@ -1138,31 +1017,28 @@ static struct ctl_table vm_table[] = { .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, - .proc_handler = &dirty_writeback_centisecs_handler, + .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_NR_PDFLUSH_THREADS, .procname = "nr_pdflush_threads", .data = &nr_pdflush_threads, .maxlen = sizeof nr_pdflush_threads, .mode = 0444 /* read-only*/, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one_hundred, }, @@ -1172,255 +1048,213 @@ static struct ctl_table vm_table[] = { .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_sysctl_handler, + .proc_handler = hugetlb_sysctl_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, }, { - .ctl_name = VM_HUGETLB_GROUP, .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &hugetlb_treat_movable_handler, + .proc_handler = hugetlb_treat_movable_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_overcommit_handler, + .proc_handler = hugetlb_overcommit_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, }, #endif { - .ctl_name = VM_LOWMEM_RESERVE_RATIO, .procname = "lowmem_reserve_ratio", .data = &sysctl_lowmem_reserve_ratio, .maxlen = sizeof(sysctl_lowmem_reserve_ratio), .mode = 0644, - .proc_handler = &lowmem_reserve_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, }, { - .ctl_name = VM_DROP_PAGECACHE, .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .strategy = &sysctl_intvec, }, { - .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", .data = &min_free_kbytes, .maxlen = sizeof(min_free_kbytes), .mode = 0644, - .proc_handler = &min_free_kbytes_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, { - .ctl_name = VM_PERCPU_PAGELIST_FRACTION, .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, - .proc_handler = &percpu_pagelist_fraction_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = &min_percpu_pagelist_fract, }, #ifdef CONFIG_MMU { - .ctl_name = VM_MAX_MAP_COUNT, .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec }, #else { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_trim_pages", .data = &sysctl_nr_trim_pages, .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #endif { - .ctl_name = VM_LAPTOP_MODE, .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = VM_BLOCK_DUMP, .procname = "block_dump", .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_VFS_CACHE_PRESSURE, .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { - .ctl_name = VM_LEGACY_VA_LAYOUT, .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_NUMA { - .ctl_name = VM_ZONE_RECLAIM_MODE, .procname = "zone_reclaim_mode", .data = &zone_reclaim_mode, .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_MIN_UNMAPPED, .procname = "min_unmapped_ratio", .data = &sysctl_min_unmapped_ratio, .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, - .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = VM_MIN_SLAB, .procname = "min_slab_ratio", .data = &sysctl_min_slab_ratio, .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, - .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, #endif #ifdef CONFIG_SMP { - .ctl_name = CTL_UNNUMBERED, .procname = "stat_interval", .data = &sysctl_stat_interval, .maxlen = sizeof(sysctl_stat_interval), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &mmap_min_addr_handler, + .proc_handler = mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { - .ctl_name = CTL_UNNUMBERED, .procname = "numa_zonelist_order", .data = &numa_zonelist_order, .maxlen = NUMA_ZONELIST_ORDER_LEN, .mode = 0644, - .proc_handler = &numa_zonelist_order_handler, - .strategy = &sysctl_string, + .proc_handler = numa_zonelist_order_handler, }, #endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { - .ctl_name = VM_VDSO_ENABLED, .procname = "vdso_enabled", .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_HIGHMEM { - .ctl_name = CTL_UNNUMBERED, .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", .data = &scan_unevictable_pages, .maxlen = sizeof(scan_unevictable_pages), .mode = 0644, - .proc_handler = &scan_unevictable_handler, + .proc_handler = scan_unevictable_handler, }, #ifdef CONFIG_MEMORY_FAILURE { - .ctl_name = CTL_UNNUMBERED, .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, .maxlen = sizeof(sysctl_memory_failure_early_kill), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "memory_failure_recovery", .data = &sysctl_memory_failure_recovery, .maxlen = sizeof(sysctl_memory_failure_recovery), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, @@ -1430,116 +1264,104 @@ static struct ctl_table vm_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) static struct ctl_table binfmt_misc_table[] = { - { .ctl_name = 0 } + { } }; #endif static struct ctl_table fs_table[] = { { - .ctl_name = FS_NRINODE, .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_STATINODE, .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "file-nr", .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_nr_files, + .proc_handler = proc_nr_files, }, { - .ctl_name = FS_MAXFILE, .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, { - .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_OVERFLOWUID, .procname = "overflowuid", .data = &fs_overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = FS_OVERFLOWGID, .procname = "overflowgid", .data = &fs_overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASES, .procname = "leases-enable", .data = &leases_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DNOTIFY { - .ctl_name = FS_DIR_NOTIFY, .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_AIO @@ -1548,19 +1370,18 @@ static struct ctl_table fs_table[] = { .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { - .ctl_name = FS_INOTIFY, .procname = "inotify", .mode = 0555, .child = inotify_table, @@ -1575,19 +1396,16 @@ static struct ctl_table fs_table[] = { #endif #endif { - .ctl_name = KERN_SETUID_DUMPABLE, .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { - .ctl_name = CTL_UNNUMBERED, .procname = "binfmt_misc", .mode = 0555, .child = binfmt_misc_table, @@ -1597,13 +1415,12 @@ static struct ctl_table fs_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) { - .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), @@ -1611,11 +1428,11 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif - { .ctl_name = 0 } + { } }; static struct ctl_table dev_table[] = { - { .ctl_name = 0 } + { } }; static DEFINE_SPINLOCK(sysctl_lock); @@ -1769,122 +1586,6 @@ void register_sysctl_root(struct ctl_table_root *root) spin_unlock(&sysctl_lock); } -#ifdef CONFIG_SYSCTL_SYSCALL -/* Perform the actual read/write of a sysctl table entry. */ -static int do_sysctl_strategy(struct ctl_table_root *root, - struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= MAY_READ; - if (newval) - op |= MAY_WRITE; - if (sysctl_perm(root, table, op)) - return -EPERM; - - if (table->strategy) { - rc = table->strategy(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } - - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - } - return 0; -} - -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table_root *root, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(root, table, MAY_EXEC)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(root, table, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} - -int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table_header *head; - int error = -ENOTDIR; - - if (nlen <= 0 || nlen >= CTL_MAXNAME) - return -ENOTDIR; - if (oldval) { - int old_len; - if (!oldlenp || get_user(old_len, oldlenp)) - return -EFAULT; - } - - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, - head->root, head->ctl_table); - if (error != -ENOTDIR) { - sysctl_head_finish(head); - break; - } - } - return error; -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - if (error) - goto out; - - lock_kernel(); - error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, - tmp.newval, tmp.newlen); - unlock_kernel(); -out: - return error; -} -#endif /* CONFIG_SYSCTL_SYSCALL */ - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. @@ -1920,7 +1621,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) { - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { table->parent = parent; if (table->child) sysctl_set_parent(table, table->child); @@ -1952,11 +1653,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch, return NULL; /* ... and nothing else */ - if (branch[1].procname || branch[1].ctl_name) + if (branch[1].procname) return NULL; /* table should contain subdirectory with the same name */ - for (p = table; p->procname || p->ctl_name; p++) { + for (p = table; p->procname; p++) { if (!p->child) continue; if (p->procname && strcmp(p->procname, s) == 0) @@ -2001,9 +1702,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * The members of the &struct ctl_table structure are used as follows: * - * ctl_name - This is the numeric sysctl value used by sysctl(2). The number - * must be unique within that level of sysctl - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * @@ -2018,8 +1716,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * proc_handler - the text handler routine (described below) * - * strategy - the strategy routine (described below) - * * de - for internal use by the sysctl routines * * extra1, extra2 - extra pointers usable by the proc handler routines @@ -2032,19 +1728,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * - * More sophisticated management can be enabled by the provision of a - * strategy routine with the table entry. This will be called before - * any automatic read or write of the data is performed. - * - * The strategy routine may return - * - * < 0 - Error occurred (error is passed to user process) - * - * 0 - OK - proceed with automatic read or write. - * - * > 0 - OK - read or write has been done by the strategy routine, so - * return immediately. - * * There must be a proc_handler routine for any terminal nodes * mirrored under /proc/sys (non-terminals are handled by a built-in * directory handler). Several default handlers are available to @@ -2071,13 +1754,13 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_set *set; /* Count the path components */ - for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) + for (npath = 0; path[npath].procname; ++npath) ; /* * For each path component, allocate a 2-element ctl_table array. * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (ctl_name == 0). + * for this, the second will be the sentinel (procname == 0). * * We allocate everything in one go so that we don't have to * worry about freeing additional memory in unregister_sysctl_table. @@ -2094,7 +1777,6 @@ struct ctl_table_header *__register_sysctl_paths( for (n = 0; n < npath; ++n, ++path) { /* Copy the procname */ new->procname = path->procname; - new->ctl_name = path->ctl_name; new->mode = 0555; *prevp = new; @@ -2956,286 +2638,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_SYSCTL_SYSCALL -/* - * General sysctl support routines - */ - -/* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, table->data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(table->data, newval, newlen)) - return -EFAULT; - } - return 1; -} - -/* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - size_t bufsize; - if (get_user(bufsize, oldlenp)) - return -EFAULT; - if (bufsize) { - size_t len = strlen(table->data), copied; - - /* This shouldn't trigger for a well-formed sysctl */ - if (len > table->maxlen) - len = table->maxlen; - - /* Copy up to a max of bufsize-1 bytes of the string */ - copied = (len >= bufsize) ? bufsize - 1 : len; - - if (copy_to_user(oldval, table->data, copied) || - put_user(0, (char __user *)(oldval + copied))) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - size_t len = newlen; - if (len > table->maxlen) - len = table->maxlen; - if(copy_from_user(table->data, newval, len)) - return -EFAULT; - if (len == table->maxlen) - len--; - ((char *) table->data)[len] = 0; - } - return 1; -} - -/* - * This function makes sure that all of the integers in the vector - * are between the minimum and maximum values given in the arrays - * table->extra1 and table->extra2, respectively. - */ -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - - if (newval && newlen) { - int __user *vec = (int __user *) newval; - int *min = (int *) table->extra1; - int *max = (int *) table->extra2; - size_t length; - int i; - - if (newlen % sizeof(int) != 0) - return -EINVAL; - - if (!table->extra1 && !table->extra2) - return 0; - - if (newlen > table->maxlen) - newlen = table->maxlen; - length = newlen / sizeof(int); - - for (i = 0; i < length; i++) { - int value; - if (get_user(value, vec + i)) - return -EFAULT; - if (min && value < min[i]) - return -EINVAL; - if (max && value > max[i]) - return -EINVAL; - } - } - return 0; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = *(int *)(table->data) / HZ; - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = new*HZ; - } - return 1; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = jiffies_to_msecs(*(int *)(table->data)); - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = msecs_to_jiffies(new); - } - return 1; -} - - - -#else /* CONFIG_SYSCTL_SYSCALL */ - - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - - /* If no error reading the parameters then just -ENOSYS ... */ - if (!error) - error = -ENOSYS; - - return error; -} - -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSCTL_SYSCALL */ - -static int deprecated_sysctl_warning(struct __sysctl_args *args) -{ - static int msg_count; - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. */ - if (args->nlen < 0 || args->nlen > CTL_MAXNAME) - return -ENOTDIR; - - /* Read in the sysctl name for better debug message logging */ - for (i = 0; i < args->nlen; i++) - if (get_user(name[i], args->name + i)) - return -EFAULT; - - /* Ignore accesses to kernel.version */ - if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) - return 0; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < args->nlen; i++) - printk("%d.", name[i]); - printk("\n"); - } - return 0; -} - /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3250,9 +2652,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(sysctl_intvec); -EXPORT_SYMBOL(sysctl_jiffies); -EXPORT_SYMBOL(sysctl_ms_jiffies); -EXPORT_SYMBOL(sysctl_string); -EXPORT_SYMBOL(sysctl_data); EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c new file mode 100644 index 00000000000..b75dbf40f57 --- /dev/null +++ b/kernel/sysctl_binary.c @@ -0,0 +1,1507 @@ +#include <linux/stat.h> +#include <linux/sysctl.h> +#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include <linux/sunrpc/debug.h> +#include <linux/string.h> +#include <net/ip_vs.h> +#include <linux/syscalls.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> +#include <linux/file.h> +#include <linux/ctype.h> +#include <linux/netdevice.h> + +#ifdef CONFIG_SYSCTL_SYSCALL + +struct bin_table; +typedef ssize_t bin_convert_t(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); + +static bin_convert_t bin_dir; +static bin_convert_t bin_string; +static bin_convert_t bin_intvec; +static bin_convert_t bin_ulongvec; +static bin_convert_t bin_uuid; +static bin_convert_t bin_dn_node_address; + +#define CTL_DIR bin_dir +#define CTL_STR bin_string +#define CTL_INT bin_intvec +#define CTL_ULONG bin_ulongvec +#define CTL_UUID bin_uuid +#define CTL_DNADR bin_dn_node_address + +#define BUFSZ 256 + +struct bin_table { + bin_convert_t *convert; + int ctl_name; + const char *procname; + const struct bin_table *child; +}; + +static const struct bin_table bin_random_table[] = { + { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, + { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, + { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, + { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, + { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, + { CTL_UUID, RANDOM_UUID, "uuid" }, + {} +}; + +static const struct bin_table bin_pty_table[] = { + { CTL_INT, PTY_MAX, "max" }, + { CTL_INT, PTY_NR, "nr" }, + {} +}; + +static const struct bin_table bin_kern_table[] = { + { CTL_STR, KERN_OSTYPE, "ostype" }, + { CTL_STR, KERN_OSRELEASE, "osrelease" }, + /* KERN_OSREV not used */ + { CTL_STR, KERN_VERSION, "version" }, + /* KERN_SECUREMASK not used */ + /* KERN_PROF not used */ + { CTL_STR, KERN_NODENAME, "hostname" }, + { CTL_STR, KERN_DOMAINNAME, "domainname" }, + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, + { CTL_INT, KERN_PRINTK, "printk" }, + + /* KERN_NAMETRANS not used */ + /* KERN_PPC_HTABRECLAIM not used */ + /* KERN_PPC_ZEROPAGED not used */ + { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, + + { CTL_STR, KERN_MODPROBE, "modprobe" }, + { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, + { CTL_INT, KERN_ACCT, "acct" }, + /* KERN_PPC_L2CR "l2cr" no longer used */ + + /* KERN_RTSIGNR not used */ + /* KERN_RTSIGMAX not used */ + + { CTL_ULONG, KERN_SHMMAX, "shmmax" }, + { CTL_INT, KERN_MSGMAX, "msgmax" }, + { CTL_INT, KERN_MSGMNB, "msgmnb" }, + /* KERN_MSGPOOL not used*/ + { CTL_INT, KERN_SYSRQ, "sysrq" }, + { CTL_INT, KERN_MAX_THREADS, "threads-max" }, + { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, + { CTL_ULONG, KERN_SHMALL, "shmall" }, + { CTL_INT, KERN_MSGMNI, "msgmni" }, + { CTL_INT, KERN_SEM, "sem" }, + { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, + { CTL_INT, KERN_SHMMNI, "shmmni" }, + + { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, + { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, + + { CTL_STR, KERN_HOTPLUG, "hotplug", }, + { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, + + { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, + { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, + /* KERN_TAINTED "tainted" no longer used */ + { CTL_INT, KERN_CADPID, "cad_pid" }, + { CTL_INT, KERN_PIDMAX, "pid_max" }, + { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, + { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, + { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, + { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, + + { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, + { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, + + { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, + { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, + { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, + /* KERN_HZ_TIMER "hz_timer" no longer used */ + { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, + { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, + { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, + + { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, + /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ + { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, + { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, + { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, + { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, + { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + {} +}; + +static const struct bin_table bin_vm_table[] = { + { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, + { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, + { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, + { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, + /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ + /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ + { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, + /* VM_PAGEBUF unused */ + /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ + { CTL_INT, VM_SWAPPINESS, "swappiness" }, + { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, + { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, + { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, + { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, + { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, + { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, + { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, + { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, + /* VM_SWAP_TOKEN_TIMEOUT unused */ + { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, + { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, + { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, + { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, + { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, + { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, + { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, + + {} +}; + +static const struct bin_table bin_net_core_table[] = { + { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, + { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, + { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, + { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, + /* NET_CORE_DESTROY_DELAY unused */ + { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, + /* NET_CORE_FASTROUTE unused */ + { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, + { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, + { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, + /* NET_CORE_HOT_LIST_LENGTH unused */ + /* NET_CORE_DIVERT_VERSION unused */ + /* NET_CORE_NO_CONG_THRESH unused */ + /* NET_CORE_NO_CONG unused */ + /* NET_CORE_LO_CONG unused */ + /* NET_CORE_MOD_CONG unused */ + { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, + { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, + { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, + { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, + { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, + { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + {}, +}; + +static const struct bin_table bin_net_unix_table[] = { + /* NET_UNIX_DESTROY_DELAY unused */ + /* NET_UNIX_DELETE_DELAY unused */ + { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, + {} +}; + +static const struct bin_table bin_net_ipv4_route_table[] = { + { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, + /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ + /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ + { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, + { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_vars_table[] = { + { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, + + { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, + { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, + { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, + { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, + { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, + { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, + { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, + { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, + { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, + { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, + { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, + { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, + { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, + + { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, + { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, + { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, + { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, + {} +}; + +static const struct bin_table bin_net_neigh_vars_table[] = { + { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, + { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, + { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, + /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ + { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, + { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, + { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, + { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, + { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, + /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ + /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ + /* NET_NEIGH_LOCKTIME "locktime" no longer used */ + { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, + { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, + { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, + { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, + { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, + {} +}; + +static const struct bin_table bin_net_neigh_table[] = { + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, + { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, + {} +}; + +static const struct bin_table bin_net_ipv4_netfilter_table[] = { + { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, + + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ + + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ + /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, + + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, + {} +}; + +static const struct bin_table bin_net_ipv4_table[] = { + {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, + + { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, + { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, + /* NET_IPV4_FIB_HASH unused */ + { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, + + { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, + { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, + { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, + { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, + { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, + /* NET_IPV4_AUTOCONFIG unused */ + { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, + { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, + { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, + { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, + { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, + { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, + { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, + { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, + { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, + { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, + { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, + { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, + { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, + { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, + { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, + { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, + { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, + { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, + { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, + { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, + { CTL_INT, NET_TCP_FACK, "tcp_fack" }, + { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, + { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, + { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, + { CTL_INT, NET_TCP_MEM, "tcp_mem" }, + { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, + { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, + { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, + { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, + { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, + { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, + { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, + { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, + { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, + { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, + { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, + { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, + { CTL_INT, NET_TCP_ABC, "tcp_abc" }, + { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, + { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, + { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, + { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, + { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, + { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, + { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, + { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, + { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, + /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ + { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, + { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, + + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, + { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, + { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, + { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, + { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, + + { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, + + { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, + /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, + + /* NET_TCP_DEFAULT_WIN_SCALE unused */ + /* NET_TCP_BIC_BETA unused */ + /* NET_IPV4_TCP_MAX_KA_PROBES unused */ + /* NET_IPV4_IP_MASQ_DEBUG unused */ + /* NET_TCP_SYN_TAILDROP unused */ + /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ + /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ + /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ + /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ + /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ + /* NET_IPV4_ALWAYS_DEFRAG unused */ + {} +}; + +static const struct bin_table bin_net_ipx_table[] = { + { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, + /* NET_IPX_FORWARDING unused */ + {} +}; + +static const struct bin_table bin_net_atalk_table[] = { + { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, + { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, + { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, + { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, + {}, +}; + +static const struct bin_table bin_net_netrom_table[] = { + { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, + { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, + { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, + { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, + { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, + { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, + { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, + { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, + { CTL_INT, NET_NETROM_RESET, "reset" }, + {} +}; + +static const struct bin_table bin_net_ax25_param_table[] = { + { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, + { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, + { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, + { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, + { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, + { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, + { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, + { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, + { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, + { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, + { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, + { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, + { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, + { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, + {} +}; + +static const struct bin_table bin_net_ax25_table[] = { + { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, + {} +}; + +static const struct bin_table bin_net_rose_table[] = { + { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, + { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, + { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, + { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, + { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_var_table[] = { + { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, + { CTL_INT, NET_IPV6_MTU, "mtu" }, + { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, + { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, + { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, + { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, + { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, + { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, + { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, + { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, + { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, + { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, + { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, + { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, + { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, + { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, + { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, + {} +}; + +static const struct bin_table bin_net_ipv6_route_table[] = { + /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ + { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + {} +}; + +static const struct bin_table bin_net_ipv6_icmp_table[] = { + { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, + {} +}; + +static const struct bin_table bin_net_ipv6_table[] = { + { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, + { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table }, + { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, + { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, + { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, + { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, + { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, + { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, + {} +}; + +static const struct bin_table bin_net_x25_table[] = { + { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, + { CTL_INT, NET_X25_FORWARD, "x25_forward" }, + {} +}; + +static const struct bin_table bin_net_tr_table[] = { + { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, + {} +}; + + +static const struct bin_table bin_net_decnet_conf_vars[] = { + { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, + { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, + { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, + { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, + {} +}; + +static const struct bin_table bin_net_decnet_conf[] = { + { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, + { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, + {} +}; + +static const struct bin_table bin_net_decnet_table[] = { + { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, + { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, + { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, + { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, + { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, + { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, + { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, + { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, + { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, + { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, + { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, + { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, + { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, + { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, + {} +}; + +static const struct bin_table bin_net_sctp_table[] = { + { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, + { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, + { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, + { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, + { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, + { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, + { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, + { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, + { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, + { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, + { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, + { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, + { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, + { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, + { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, + { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, + { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_timeout_table[] = { + { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, + { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, + { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" }, + { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, + {} +}; + +static const struct bin_table bin_net_llc_station_table[] = { + { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_table[] = { + { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, + {} +}; + +static const struct bin_table bin_net_llc_table[] = { + { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, + { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, + {} +}; + +static const struct bin_table bin_net_netfilter_table[] = { + { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ + /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ + /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, + { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, + /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ + /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, + + {} +}; + +static const struct bin_table bin_net_irda_table[] = { + { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, + { CTL_STR, NET_IRDA_DEVNAME, "devname" }, + { CTL_INT, NET_IRDA_DEBUG, "debug" }, + { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, + { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, + { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, + { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, + { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, + { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, + { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, + { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, + { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, + { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, + { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, + {} +}; + +static const struct bin_table bin_net_table[] = { + { CTL_DIR, NET_CORE, "core", bin_net_core_table }, + /* NET_ETHER not used */ + /* NET_802 not used */ + { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, + { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, + { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, + { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, + { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, + { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, + /* NET_BRIDGE "bridge" no longer used */ + { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, + { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, + { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, + { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, + { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, + /* NET_ECONET not used */ + { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, + { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, + { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, + /* NET_DCCP "dccp" no longer used */ + { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + { CTL_INT, 2089, "nf_conntrack_max" }, + {} +}; + +static const struct bin_table bin_fs_quota_table[] = { + { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, + { CTL_INT, FS_DQ_DROPS, "drops" }, + { CTL_INT, FS_DQ_READS, "reads" }, + { CTL_INT, FS_DQ_WRITES, "writes" }, + { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, + { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, + { CTL_INT, FS_DQ_FREE, "free_dquots" }, + { CTL_INT, FS_DQ_SYNCS, "syncs" }, + { CTL_INT, FS_DQ_WARNINGS, "warnings" }, + {} +}; + +static const struct bin_table bin_fs_xfs_table[] = { + { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, + { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, + { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, + + { CTL_INT, XFS_ERRLEVEL, "error_level" }, + { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, + { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, + { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, + { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, + { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, + { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, + { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, + { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, + { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, + { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, + { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_nm_table[] = { + { CTL_STR, 1, "hb_ctl_path" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_table[] = { + { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, + {} +}; + +static const struct bin_table bin_inotify_table[] = { + { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, + { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + {} +}; + +static const struct bin_table bin_fs_table[] = { + { CTL_INT, FS_NRINODE, "inode-nr" }, + { CTL_INT, FS_STATINODE, "inode-state" }, + /* FS_MAXINODE unused */ + /* FS_NRDQUOT unused */ + /* FS_MAXDQUOT unused */ + /* FS_NRFILE "file-nr" no longer used */ + { CTL_INT, FS_MAXFILE, "file-max" }, + { CTL_INT, FS_DENTRY, "dentry-state" }, + /* FS_NRSUPER unused */ + /* FS_MAXUPSER unused */ + { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, + { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, + { CTL_INT, FS_LEASES, "leases-enable" }, + { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, + { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, + { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, + { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, + { CTL_ULONG, FS_AIO_NR, "aio-nr" }, + { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, + { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, + { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, + { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, + {} +}; + +static const struct bin_table bin_ipmi_table[] = { + { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, + {} +}; + +static const struct bin_table bin_mac_hid_files[] = { + /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ + /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, + /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ + {} +}; + +static const struct bin_table bin_raid_table[] = { + { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, + { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, + {} +}; + +static const struct bin_table bin_scsi_table[] = { + { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, + {} +}; + +static const struct bin_table bin_dev_table[] = { + /* DEV_CDROM "cdrom" no longer used */ + /* DEV_HWMON unused */ + /* DEV_PARPORT "parport" no longer used */ + { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, + { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, + { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, + { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, + {} +}; + +static const struct bin_table bin_bus_isa_table[] = { + { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, + { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, + { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, + {} +}; + +static const struct bin_table bin_bus_table[] = { + { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, + {} +}; + + +static const struct bin_table bin_s390dbf_table[] = { + { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, + { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, + {} +}; + +static const struct bin_table bin_sunrpc_table[] = { + /* CTL_RPCDEBUG "rpc_debug" no longer used */ + /* CTL_NFSDEBUG "nfs_debug" no longer used */ + /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ + /* CTL_NLMDEBUG "nlm_debug" no longer used */ + + { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, + { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, + { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, + { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, + {} +}; + +static const struct bin_table bin_pm_table[] = { + /* frv specific */ + /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ + { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, + { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, + { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, + {} +}; + +static const struct bin_table bin_root_table[] = { + { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, + { CTL_DIR, CTL_VM, "vm", bin_vm_table }, + { CTL_DIR, CTL_NET, "net", bin_net_table }, + /* CTL_PROC not used */ + { CTL_DIR, CTL_FS, "fs", bin_fs_table }, + /* CTL_DEBUG "debug" no longer used */ + { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, + { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, + { CTL_DIR, CTL_ABI, "abi" }, + /* CTL_CPU not used */ + /* CTL_ARLAN "arlan" no longer used */ + { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, + { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, + { CTL_DIR, CTL_PM, "pm", bin_pm_table }, + {} +}; + +static ssize_t bin_dir(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOTDIR; +} + + +static ssize_t bin_string(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + ssize_t result, copied = 0; + + if (oldval && oldlen) { + char __user *lastp; + loff_t pos = 0; + int ch; + + result = vfs_read(file, oldval, oldlen, &pos); + if (result < 0) + goto out; + + copied = result; + lastp = oldval + copied - 1; + + result = -EFAULT; + if (get_user(ch, lastp)) + goto out; + + /* Trim off the trailing newline */ + if (ch == '\n') { + result = -EFAULT; + if (put_user('\0', lastp)) + goto out; + copied -= 1; + } + } + + if (newval && newlen) { + loff_t pos = 0; + + result = vfs_write(file, newval, newlen, &pos); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static ssize_t bin_intvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static ssize_t bin_ulongvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned long __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned long __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static unsigned hex_value(int ch) +{ + return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10; +} + +static ssize_t bin_uuid(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + /* Only supports reads */ + if (oldval && oldlen) { + loff_t pos = 0; + char buf[40], *str = buf; + unsigned char uuid[16]; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the uuid to from a string to binary */ + for (i = 0; i < 16; i++) { + result = -EIO; + if (!isxdigit(str[0]) || !isxdigit(str[1])) + goto out; + + uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); + str += 2; + if (*str == '-') + str++; + } + + if (oldlen > 16) + oldlen = 16; + + result = -EFAULT; + if (copy_to_user(oldval, uuid, oldlen)) + goto out; + + copied = oldlen; + } + result = copied; +out: + return result; +} + +static ssize_t bin_dn_node_address(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + if (oldval && oldlen) { + loff_t pos = 0; + char buf[15], *nodep; + unsigned long area, node; + __le16 dnaddr; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the decnet addresss to binary */ + result = -EIO; + nodep = strchr(buf, '.') + 1; + if (!nodep) + goto out; + + area = simple_strtoul(buf, NULL, 10); + node = simple_strtoul(nodep, NULL, 10); + + result = -EIO; + if ((area > 63)||(node > 1023)) + goto out; + + dnaddr = cpu_to_le16((area << 10) | node); + + result = -EFAULT; + if (put_user(dnaddr, (__le16 __user *)oldval)) + goto out; + + copied = sizeof(dnaddr); + } + + if (newval && newlen) { + loff_t pos = 0; + __le16 dnaddr; + char buf[15]; + int len; + + result = -EINVAL; + if (newlen != sizeof(dnaddr)) + goto out; + + result = -EFAULT; + if (get_user(dnaddr, (__le16 __user *)newval)) + goto out; + + len = snprintf(buf, sizeof(buf), "%hu.%hu", + le16_to_cpu(dnaddr) >> 10, + le16_to_cpu(dnaddr) & 0x3ff); + + set_fs(KERNEL_DS); + result = vfs_write(file, buf, len, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) +{ + const struct bin_table *table = &bin_root_table[0]; + int ctl_name; + + /* The binary sysctl tables have a small maximum depth so + * there is no danger of overflowing our path as it PATH_MAX + * bytes long. + */ + memcpy(path, "sys/", 4); + path += 4; + +repeat: + if (!nlen) + return ERR_PTR(-ENOTDIR); + ctl_name = *name; + name++; + nlen--; + for ( ; table->convert; table++) { + int len = 0; + + /* + * For a wild card entry map from ifindex to network + * device name. + */ + if (!table->ctl_name) { +#ifdef CONFIG_NET + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + dev = dev_get_by_index(net, ctl_name); + if (dev) { + len = strlen(dev->name); + memcpy(path, dev->name, len); + dev_put(dev); + } +#endif + /* Use the well known sysctl number to proc name mapping */ + } else if (ctl_name == table->ctl_name) { + len = strlen(table->procname); + memcpy(path, table->procname, len); + } + if (len) { + path += len; + if (table->child) { + *path++ = '/'; + table = table->child; + goto repeat; + } + *path = '\0'; + return table; + } + } + return ERR_PTR(-ENOTDIR); +} + +static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + const struct bin_table *table = get_sysctl(name, nlen, tmp); + result = tmp; + *tablep = table; + if (IS_ERR(table)) { + __putname(tmp); + result = ERR_CAST(table); + } + } + return result; +} + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + const struct bin_table *table = NULL; + struct nameidata nd; + struct vfsmount *mnt; + struct file *file; + ssize_t result; + char *pathname; + int flags; + int acc_mode, fmode; + + pathname = sysctl_getname(name, nlen, &table); + result = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + /* How should the sysctl be accessed? */ + if (oldval && oldlen && newval && newlen) { + flags = O_RDWR; + acc_mode = MAY_READ | MAY_WRITE; + fmode = FMODE_READ | FMODE_WRITE; + } else if (newval && newlen) { + flags = O_WRONLY; + acc_mode = MAY_WRITE; + fmode = FMODE_WRITE; + } else if (oldval && oldlen) { + flags = O_RDONLY; + acc_mode = MAY_READ; + fmode = FMODE_READ; + } else { + result = 0; + goto out_putname; + } + + mnt = current->nsproxy->pid_ns->proc_mnt; + result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); + if (result) + goto out_putname; + + result = may_open(&nd.path, acc_mode, fmode); + if (result) + goto out_putpath; + + file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + result = PTR_ERR(file); + if (IS_ERR(file)) + goto out_putname; + + result = table->convert(file, oldval, oldlen, newval, newlen); + + fput(file); +out_putname: + putname(pathname); +out: + return result; + +out_putpath: + path_put(&nd.path); + goto out_putname; +} + + +#else /* CONFIG_SYSCTL_SYSCALL */ + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOSYS; +} + +#endif /* CONFIG_SYSCTL_SYSCALL */ + + +static void deprecated_sysctl_warning(const int *name, int nlen) +{ + int i; + + if (printk_ratelimit()) { + printk(KERN_INFO + "warning: process `%s' used the deprecated sysctl " + "system call with ", current->comm); + for (i = 0; i < nlen; i++) + printk("%d.", name[i]); + printk("\n"); + } + return; +} + +static ssize_t do_sysctl(int __user *args_name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + int name[CTL_MAXNAME]; + int i; + + /* Check args->nlen. */ + if (nlen < 0 || nlen > CTL_MAXNAME) + return -ENOTDIR; + /* Read in the sysctl name for simplicity */ + for (i = 0; i < nlen; i++) + if (get_user(name[i], args_name + i)) + return -EFAULT; + + deprecated_sysctl_warning(name, nlen); + + return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); +} + +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) +{ + struct __sysctl_args tmp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, + tmp.newval, tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + return result; +} + + +#ifdef CONFIG_COMPAT +#include <asm/compat.h> + +struct compat_sysctl_args { + compat_uptr_t name; + int nlen; + compat_uptr_t oldval; + compat_uptr_t oldlenp; + compat_uptr_t newval; + compat_size_t newlen; + compat_ulong_t __unused[4]; +}; + +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +{ + struct compat_sysctl_args tmp; + compat_size_t __user *compat_oldlenp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + compat_oldlenp = compat_ptr(tmp.oldlenp); + if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) + return -EFAULT; + + result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, + compat_ptr(tmp.oldval), oldlen, + compat_ptr(tmp.newval), tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) + return -EFAULT; + + return result; +} + +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index f1d676e4b36..04cdcf72c82 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -5,1240 +5,6 @@ #include <linux/string.h> #include <net/ip_vs.h> -struct trans_ctl_table { - int ctl_name; - const char *procname; - const struct trans_ctl_table *child; -}; - -static const struct trans_ctl_table trans_random_table[] = { - { RANDOM_POOLSIZE, "poolsize" }, - { RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { RANDOM_BOOT_ID, "boot_id" }, - { RANDOM_UUID, "uuid" }, - {} -}; - -static const struct trans_ctl_table trans_pty_table[] = { - { PTY_MAX, "max" }, - { PTY_NR, "nr" }, - {} -}; - -static const struct trans_ctl_table trans_kern_table[] = { - { KERN_OSTYPE, "ostype" }, - { KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { KERN_NODENAME, "hostname" }, - { KERN_DOMAINNAME, "domainname" }, - - { KERN_PANIC, "panic" }, - { KERN_REALROOTDEV, "real-root-dev" }, - - { KERN_SPARC_REBOOT, "reboot-cmd" }, - { KERN_CTLALTDEL, "ctrl-alt-del" }, - { KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { KERN_MODPROBE, "modprobe" }, - { KERN_SG_BIG_BUFF, "sg-big-buff" }, - { KERN_ACCT, "acct" }, - { KERN_PPC_L2CR, "l2cr" }, - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { KERN_SHMMAX, "shmmax" }, - { KERN_MSGMAX, "msgmax" }, - { KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { KERN_SYSRQ, "sysrq" }, - { KERN_MAX_THREADS, "threads-max" }, - { KERN_RANDOM, "random", trans_random_table }, - { KERN_SHMALL, "shmall" }, - { KERN_MSGMNI, "msgmni" }, - { KERN_SEM, "sem" }, - { KERN_SPARC_STOP_A, "stop-a" }, - { KERN_SHMMNI, "shmmni" }, - - { KERN_OVERFLOWUID, "overflowuid" }, - { KERN_OVERFLOWGID, "overflowgid" }, - - { KERN_HOTPLUG, "hotplug", }, - { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { KERN_CORE_USES_PID, "core_uses_pid" }, - { KERN_TAINTED, "tainted" }, - { KERN_CADPID, "cad_pid" }, - { KERN_PIDMAX, "pid_max" }, - { KERN_CORE_PATTERN, "core_pattern" }, - { KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { KERN_HPPA_PWRSW, "soft-power" }, - { KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { KERN_PTY, "pty", trans_pty_table }, - { KERN_NGROUPS_MAX, "ngroups_max" }, - { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - { KERN_HZ_TIMER, "hz_timer" }, - { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { KERN_RANDOMIZE, "randomize_va_space" }, - - { KERN_SPIN_RETRY, "spin_retry" }, - { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" }, - { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { KERN_COMPAT_LOG, "compat-log" }, - { KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { KERN_NMI_WATCHDOG, "nmi_watchdog" }, - { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - {} -}; - -static const struct trans_ctl_table trans_vm_table[] = { - { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { VM_PAGE_CLUSTER, "page-cluster" }, - { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { VM_DIRTY_RATIO, "dirty_ratio" }, - { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" }, - { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" }, - { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, - { VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - { VM_HUGETLB_PAGES, "nr_hugepages" }, - { VM_SWAPPINESS, "swappiness" }, - { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { VM_MAX_MAP_COUNT, "max_map_count" }, - { VM_LAPTOP_MODE, "laptop_mode" }, - { VM_BLOCK_DUMP, "block_dump" }, - { VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { VM_DROP_PAGECACHE, "drop_caches" }, - { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { VM_PANIC_ON_OOM, "panic_on_oom" }, - { VM_VDSO_ENABLED, "vdso_enabled" }, - { VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct trans_ctl_table trans_net_core_table[] = { - { NET_CORE_WMEM_MAX, "wmem_max" }, - { NET_CORE_RMEM_MAX, "rmem_max" }, - { NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { NET_CORE_MSG_COST, "message_cost" }, - { NET_CORE_MSG_BURST, "message_burst" }, - { NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { NET_CORE_DEV_WEIGHT, "dev_weight" }, - { NET_CORE_SOMAXCONN, "somaxconn" }, - { NET_CORE_BUDGET, "netdev_budget" }, - { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct trans_ctl_table trans_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_route_table[] = { - { NET_IPV4_ROUTE_FLUSH, "flush" }, - { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, - { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, - { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { - { NET_IPV4_CONF_FORWARDING, "forwarding" }, - { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { NET_IPV4_CONF_TAG, "tag" }, - { NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - - { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - { NET_IPV4_CONF_ACCEPT_LOCAL, "accept_local" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, - { 0, NULL, trans_net_ipv4_conf_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_vars_table[] = { - { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { NET_NEIGH_APP_SOLICIT, "app_solicit" }, - { NET_NEIGH_RETRANS_TIME, "retrans_time" }, - { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" }, - { NET_NEIGH_PROXY_DELAY, "proxy_delay" }, - { NET_NEIGH_LOCKTIME, "locktime" }, - { NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_table[] = { - { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, - { 0, NULL, trans_net_neigh_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { - { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" }, - - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" }, - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" }, - { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" }, - { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" }, - - { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" }, - { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" }, - - { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_table[] = { - { NET_IPV4_FORWARD, "ip_forward" }, - { NET_IPV4_DYNADDR, "ip_dynaddr" }, - - { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table }, - { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, - - { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { NET_IPV4_TCP_SACK, "tcp_sack" }, - { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - /* NET_IPV4_IP_MASQ_DEBUG unused */ - { NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { NET_TCP_STDURG, "tcp_stdurg" }, - { NET_TCP_RFC1337, "tcp_rfc1337" }, - /* NET_TCP_SYN_TAILDROP unused */ - { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - /* NET_IPV4_ALWAYS_DEFRAG unused */ - { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { NET_TCP_FACK, "tcp_fack" }, - { NET_TCP_REORDERING, "tcp_reordering" }, - { NET_TCP_ECN, "tcp_ecn" }, - { NET_TCP_DSACK, "tcp_dsack" }, - { NET_TCP_MEM, "tcp_mem" }, - { NET_TCP_WMEM, "tcp_wmem" }, - { NET_TCP_RMEM, "tcp_rmem" }, - { NET_TCP_APP_WIN, "tcp_app_win" }, - { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - { NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { NET_TCP_FRTO, "tcp_frto" }, - { NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - /* NET_TCP_BIC_BETA unused */ - { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { NET_TCP_ABC, "tcp_abc" }, - { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" }, - { NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { NET_TCP_BASE_MSS, "tcp_base_mss" }, - { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, - { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" }, - { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipx_table[] = { - { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct trans_ctl_table trans_net_atalk_table[] = { - { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct trans_ctl_table trans_net_netrom_table[] = { - { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_param_table[] = { - { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { NET_AX25_CONNECT_MODE, "connect_mode" }, - { NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { NET_AX25_N2, "maximum_retry_count" }, - { NET_AX25_PACLEN, "maximum_packet_length" }, - { NET_AX25_PROTOCOL, "protocol" }, - { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_table[] = { - { 0, NULL, trans_net_ax25_param_table }, - {} -}; - -static const struct trans_ctl_table trans_net_bridge_table[] = { - { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, - { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, - { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, - { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" }, - { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" }, - {} -}; - -static const struct trans_ctl_table trans_net_rose_table[] = { - { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { NET_ROSE_WINDOW_SIZE, "window_size" }, - { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { - { NET_IPV6_FORWARDING, "forwarding" }, - { NET_IPV6_HOP_LIMIT, "hop_limit" }, - { NET_IPV6_MTU, "mtu" }, - { NET_IPV6_ACCEPT_RA, "accept_ra" }, - { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV6_AUTOCONF, "autoconf" }, - { NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, - { 0, NULL, trans_net_ipv6_conf_var_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_route_table[] = { - { NET_IPV6_ROUTE_FLUSH, "flush" }, - { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { - { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_table[] = { - { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, - { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, - { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table }, - { NET_IPV6_BINDV6ONLY, "bindv6only" }, - { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_x25_table[] = { - { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct trans_ctl_table trans_net_tr_table[] = { - { NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { - { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { NET_DECNET_CONF_DEV_T2, "t2" }, - { NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_conf[] = { - { 0, NULL, trans_net_decnet_conf_vars }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_table[] = { - { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, - { NET_DECNET_NODE_ADDRESS, "node_address" }, - { NET_DECNET_NODE_NAME, "node_name" }, - { NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { NET_DECNET_TIME_WAIT, "time_wait" }, - { NET_DECNET_DN_COUNT, "dn_count" }, - { NET_DECNET_DI_COUNT, "di_count" }, - { NET_DECNET_DR_COUNT, "dr_count" }, - { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { NET_DECNET_MEM, "decnet_mem" }, - { NET_DECNET_RMEM, "decnet_rmem" }, - { NET_DECNET_WMEM, "decnet_wmem" }, - { NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct trans_ctl_table trans_net_sctp_table[] = { - { NET_SCTP_RTO_INITIAL, "rto_initial" }, - { NET_SCTP_RTO_MIN, "rto_min" }, - { NET_SCTP_RTO_MAX, "rto_max" }, - { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { NET_SCTP_HB_INTERVAL, "hb_interval" }, - { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { NET_SCTP_MAX_BURST, "max_burst" }, - { NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { - { NET_LLC2_ACK_TIMEOUT, "ack" }, - { NET_LLC2_P_TIMEOUT, "p" }, - { NET_LLC2_REJ_TIMEOUT, "rej" }, - { NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_station_table[] = { - { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_table[] = { - { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_table[] = { - { NET_LLC2, "llc2", trans_net_llc_llc2_table }, - { NET_LLC_STATION, "station", trans_net_llc_station_table }, - {} -}; - -static const struct trans_ctl_table trans_net_netfilter_table[] = { - { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" }, - { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" }, - { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" }, - { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" }, - { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" }, - { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct trans_ctl_table trans_net_dccp_table[] = { - { NET_DCCP_DEFAULT, "default" }, - {} -}; - -static const struct trans_ctl_table trans_net_irda_table[] = { - { NET_IRDA_DISCOVERY, "discovery" }, - { NET_IRDA_DEVNAME, "devname" }, - { NET_IRDA_DEBUG, "debug" }, - { NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - -static const struct trans_ctl_table trans_net_table[] = { - { NET_CORE, "core", trans_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { NET_UNIX, "unix", trans_net_unix_table }, - { NET_IPV4, "ipv4", trans_net_ipv4_table }, - { NET_IPX, "ipx", trans_net_ipx_table }, - { NET_ATALK, "appletalk", trans_net_atalk_table }, - { NET_NETROM, "netrom", trans_net_netrom_table }, - { NET_AX25, "ax25", trans_net_ax25_table }, - { NET_BRIDGE, "bridge", trans_net_bridge_table }, - { NET_ROSE, "rose", trans_net_rose_table }, - { NET_IPV6, "ipv6", trans_net_ipv6_table }, - { NET_X25, "x25", trans_net_x25_table }, - { NET_TR, "token-ring", trans_net_tr_table }, - { NET_DECNET, "decnet", trans_net_decnet_table }, - /* NET_ECONET not used */ - { NET_SCTP, "sctp", trans_net_sctp_table }, - { NET_LLC, "llc", trans_net_llc_table }, - { NET_NETFILTER, "netfilter", trans_net_netfilter_table }, - { NET_DCCP, "dccp", trans_net_dccp_table }, - { NET_IRDA, "irda", trans_net_irda_table }, - { 2089, "nf_conntrack_max" }, - {} -}; - -static const struct trans_ctl_table trans_fs_quota_table[] = { - { FS_DQ_LOOKUPS, "lookups" }, - { FS_DQ_DROPS, "drops" }, - { FS_DQ_READS, "reads" }, - { FS_DQ_WRITES, "writes" }, - { FS_DQ_CACHE_HITS, "cache_hits" }, - { FS_DQ_ALLOCATED, "allocated_dquots" }, - { FS_DQ_FREE, "free_dquots" }, - { FS_DQ_SYNCS, "syncs" }, - { FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct trans_ctl_table trans_fs_xfs_table[] = { - { XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { XFS_PANIC_MASK, "panic_mask" }, - - { XFS_ERRLEVEL, "error_level" }, - { XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { XFS_INHERIT_SYNC, "inherit_sync" }, - { XFS_INHERIT_NODUMP, "inherit_nodump" }, - { XFS_INHERIT_NOATIME, "inherit_noatime" }, - { XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { XFS_BUF_AGE, "age_buffer_centisecs" }, - { XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { XFS_ROTORSTEP, "rotorstep" }, - { XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { - { 1, "hb_ctl_path" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_table[] = { - { 1, "nm", trans_fs_ocfs2_nm_table }, - {} -}; - -static const struct trans_ctl_table trans_inotify_table[] = { - { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct trans_ctl_table trans_fs_table[] = { - { FS_NRINODE, "inode-nr" }, - { FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - { FS_NRFILE, "file-nr" }, - { FS_MAXFILE, "file-max" }, - { FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { FS_OVERFLOWUID, "overflowuid" }, - { FS_OVERFLOWGID, "overflowgid" }, - { FS_LEASES, "leases-enable" }, - { FS_DIR_NOTIFY, "dir-notify-enable" }, - { FS_LEASE_TIME, "lease-break-time" }, - { FS_DQSTATS, "quota", trans_fs_quota_table }, - { FS_XFS, "xfs", trans_fs_xfs_table }, - { FS_AIO_NR, "aio-nr" }, - { FS_AIO_MAX_NR, "aio-max-nr" }, - { FS_INOTIFY, "inotify", trans_inotify_table }, - { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table }, - { KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct trans_ctl_table trans_debug_table[] = { - {} -}; - -static const struct trans_ctl_table trans_cdrom_table[] = { - { DEV_CDROM_INFO, "info" }, - { DEV_CDROM_AUTOCLOSE, "autoclose" }, - { DEV_CDROM_AUTOEJECT, "autoeject" }, - { DEV_CDROM_DEBUG, "debug" }, - { DEV_CDROM_LOCK, "lock" }, - { DEV_CDROM_CHECK_MEDIA, "check_media" }, - {} -}; - -static const struct trans_ctl_table trans_ipmi_table[] = { - { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct trans_ctl_table trans_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct trans_ctl_table trans_raid_table[] = { - { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct trans_ctl_table trans_scsi_table[] = { - { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct trans_ctl_table trans_parport_default_table[] = { - { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, - { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, - {} -}; - -static const struct trans_ctl_table trans_parport_device_table[] = { - { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, - {} -}; - -static const struct trans_ctl_table trans_parport_devices_table[] = { - { DEV_PARPORT_DEVICES_ACTIVE, "active" }, - { 0, NULL, trans_parport_device_table }, - {} -}; - -static const struct trans_ctl_table trans_parport_parport_table[] = { - { DEV_PARPORT_SPINTIME, "spintime" }, - { DEV_PARPORT_BASE_ADDR, "base-addr" }, - { DEV_PARPORT_IRQ, "irq" }, - { DEV_PARPORT_DMA, "dma" }, - { DEV_PARPORT_MODES, "modes" }, - { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table }, - { DEV_PARPORT_AUTOPROBE, "autoprobe" }, - { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" }, - { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" }, - { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" }, - { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, - {} -}; -static const struct trans_ctl_table trans_parport_table[] = { - { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, - { 0, NULL, trans_parport_parport_table }, - {} -}; - -static const struct trans_ctl_table trans_dev_table[] = { - { DEV_CDROM, "cdrom", trans_cdrom_table }, - /* DEV_HWMON unused */ - { DEV_PARPORT, "parport", trans_parport_table }, - { DEV_RAID, "raid", trans_raid_table }, - { DEV_MAC_HID, "mac_hid", trans_mac_hid_files }, - { DEV_SCSI, "scsi", trans_scsi_table }, - { DEV_IPMI, "ipmi", trans_ipmi_table }, - {} -}; - -static const struct trans_ctl_table trans_bus_isa_table[] = { - { BUS_ISA_MEM_BASE, "membase" }, - { BUS_ISA_PORT_BASE, "portbase" }, - { BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct trans_ctl_table trans_bus_table[] = { - { CTL_BUS_ISA, "isa", trans_bus_isa_table }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table0[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan0-txRing" }, - { 151, "arlan0-rxRing" }, - { 152, "arlan0-18" }, - { 153, "arlan0-ring" }, - { 154, "arlan0-shm-cpy" }, - { 155, "config0" }, - { 156, "reset0" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table1[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan1-txRing" }, - { 151, "arlan1-rxRing" }, - { 152, "arlan1-18" }, - { 153, "arlan1-ring" }, - { 154, "arlan1-shm-cpy" }, - { 155, "config1" }, - { 156, "reset1" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table2[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan2-txRing" }, - { 151, "arlan2-rxRing" }, - { 152, "arlan2-18" }, - { 153, "arlan2-ring" }, - { 154, "arlan2-shm-cpy" }, - { 155, "config2" }, - { 156, "reset2" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table3[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan3-txRing" }, - { 151, "arlan3-rxRing" }, - { 152, "arlan3-18" }, - { 153, "arlan3-ring" }, - { 154, "arlan3-shm-cpy" }, - { 155, "config3" }, - { 156, "reset3" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_table[] = { - { 1, "arlan0", trans_arlan_conf_table0 }, - { 2, "arlan1", trans_arlan_conf_table1 }, - { 3, "arlan2", trans_arlan_conf_table2 }, - { 4, "arlan3", trans_arlan_conf_table3 }, - {} -}; - -static const struct trans_ctl_table trans_s390dbf_table[] = { - { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct trans_ctl_table trans_sunrpc_table[] = { - { CTL_RPCDEBUG, "rpc_debug" }, - { CTL_NFSDEBUG, "nfs_debug" }, - { CTL_NFSDDEBUG, "nfsd_debug" }, - { CTL_NLMDEBUG, "nlm_debug" }, - { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct trans_ctl_table trans_pm_table[] = { - { 1 /* CTL_PM_SUSPEND */, "suspend" }, - { 2 /* CTL_PM_CMODE */, "cmode" }, - { 3 /* CTL_PM_P0 */, "p0" }, - { 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct trans_ctl_table trans_frv_table[] = { - { 1, "cache-mode" }, - { 2, "pin-cxnr" }, - {} -}; - -static const struct trans_ctl_table trans_root_table[] = { - { CTL_KERN, "kernel", trans_kern_table }, - { CTL_VM, "vm", trans_vm_table }, - { CTL_NET, "net", trans_net_table }, - /* CTL_PROC not used */ - { CTL_FS, "fs", trans_fs_table }, - { CTL_DEBUG, "debug", trans_debug_table }, - { CTL_DEV, "dev", trans_dev_table }, - { CTL_BUS, "bus", trans_bus_table }, - { CTL_ABI, "abi" }, - /* CTL_CPU not used */ - { CTL_ARLAN, "arlan", trans_arlan_table }, - { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, - { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, - { CTL_PM, "pm", trans_pm_table }, - { CTL_FRV, "frv", trans_frv_table }, - {} -}; - - - static int sysctl_depth(struct ctl_table *table) { @@ -1262,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) return table; } -static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) -{ - struct ctl_table *test; - const struct trans_ctl_table *ref; - int cur_depth; - - cur_depth = sysctl_depth(table); - - ref = trans_root_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname || ref->child; ref++) { - int match = 0; - - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - - if (!ref->ctl_name && !ref->procname) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - ref = NULL; -out: - return ref; -} static void sysctl_print_path(struct ctl_table *table) { @@ -1316,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table) } } printk(" "); - if (table->ctl_name) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk(".%d", tmp->ctl_name); - } - } -} - -static void sysctl_repair_table(struct ctl_table *table) -{ - /* Don't complain about the classic default - * sysctl strategy routine. Maybe later we - * can get the tables fixed and complain about - * this. - */ - if (table->ctl_name && table->procname && - (table->proc_handler == proc_dointvec) && - (!table->strategy)) { - table->strategy = sysctl_data; - } } static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, @@ -1353,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, ref = head->ctl_table; repeat: test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname; ref++) { + for (; ref->procname; ref++) { int match = 0; if (cur_depth && !ref->child) continue; @@ -1362,10 +67,6 @@ repeat: (strcmp(test->procname, ref->procname) == 0)) match++; - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - if (match) { if (cur_depth != 0) { cur_depth--; @@ -1393,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str *fail = str; } -static int sysctl_check_dir(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table *ref; - int error; - - error = 0; - ref = sysctl_check_lookup(namespaces, table); - if (ref) { - int match = 0; - if ((!table->procname && !ref->procname) || - (table->procname && ref->procname && - (strcmp(table->procname, ref->procname) == 0))) - match++; - - if ((!table->ctl_name && !ref->ctl_name) || - (table->ctl_name && ref->ctl_name && - (table->ctl_name == ref->ctl_name))) - match++; - - if (match != 2) { - printk(KERN_ERR "%s: failed: ", __func__); - sysctl_print_path(table); - printk(" ref: "); - sysctl_print_path(ref); - printk("\n"); - error = -EINVAL; - } - } - return error; -} - static void sysctl_check_leaf(struct nsproxy *namespaces, struct ctl_table *table, const char **fail) { @@ -1435,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces, set_fail(fail, table, "Sysctl already exists"); } -static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) -{ - const struct trans_ctl_table *ref; - - ref = sysctl_binary_lookup(table); - if (table->ctl_name && !ref) - set_fail(fail, table, "Unknown sysctl binary path"); - if (ref) { - if (ref->procname && - (!table->procname || - (strcmp(table->procname, ref->procname) != 0))) - set_fail(fail, table, "procname does not match binary path procname"); - - if (ref->ctl_name && table->ctl_name && - (table->ctl_name != ref->ctl_name)) - set_fail(fail, table, "ctl_name does not match binary path ctl_name"); - } -} - int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) { int error = 0; - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { const char *fail = NULL; - sysctl_repair_table(table); if (table->parent) { if (table->procname && !table->parent->procname) set_fail(&fail, table, "Parent without procname"); - if (table->ctl_name && !table->parent->ctl_name) - set_fail(&fail, table, "Parent without ctl_name"); } if (!table->procname) set_fail(&fail, table, "No procname"); @@ -1478,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "Writable sysctl directory"); if (table->proc_handler) set_fail(&fail, table, "Directory with proc_handler"); - if (table->strategy) - set_fail(&fail, table, "Directory with strategy"); if (table->extra1) set_fail(&fail, table, "Directory with extra1"); if (table->extra2) set_fail(&fail, table, "Directory with extra2"); - if (sysctl_check_dir(namespaces, table)) - set_fail(&fail, table, "Inconsistent directory names"); } else { - if ((table->strategy == sysctl_data) || - (table->strategy == sysctl_string) || - (table->strategy == sysctl_intvec) || - (table->strategy == sysctl_jiffies) || - (table->strategy == sysctl_ms_jiffies) || - (table->proc_handler == proc_dostring) || + if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || @@ -1514,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No max"); } } -#ifdef CONFIG_SYSCTL_SYSCALL - if (table->ctl_name && !table->strategy) - set_fail(&fail, table, "Missing strategy"); -#endif -#if 0 - if (!table->ctl_name && table->strategy) - set_fail(&fail, table, "Strategy without ctl_name"); -#endif #ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); @@ -1532,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #endif sysctl_check_leaf(namespaces, table, &fail); } - sysctl_check_bin_path(table, &fail); if (table->mode > 0777) set_fail(&fail, table, "bogus .mode"); if (fail) { diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fe..804798005d1 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -662,6 +662,36 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b416512ad17..d006554888d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -339,6 +339,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. config STACK_TRACER bool "Trace max stack" @@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on X86 + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 26f03ac07c2..cd9ecd89ec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a72c6e03dee..a1ca4956ab5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index acef8b4636f..1d7f4830a80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -37,6 +38,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -98,9 +100,32 @@ struct syscall_trace_enter { struct syscall_trace_exit { struct trace_entry ent; int nr; - unsigned long ret; + long ret; }; +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kretprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -364,6 +390,8 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); #ifdef CONFIG_TRACER_MAX_TRACE @@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -683,7 +713,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -703,7 +732,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { - MATCH_FULL, + MATCH_FULL = 0, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, @@ -744,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ead3d724599..c16a08f399d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, F_printk("type:%u call_site:%lx ptr:%p", __entry->type_id, __entry->call_site, __entry->ptr) ); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , cmd, TASK_COMM_LEN ) + __field( unsigned long, addr ) + ), + + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + (void *)__entry->addr, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 8d5c171cc99..d9c60f80aa0 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,17 +8,14 @@ #include <linux/module.h> #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char *trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +char *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); + +char *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; /* Count the events in use (per event id, not per instance) */ static int total_profile_count; @@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) return 0; if (!total_profile_count) { - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(); + ret = event->profile_enable(event); if (!ret) { total_profile_count++; return 0; @@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) fail_buf_nmi: if (!total_profile_count) { - free_percpu(trace_profile_buf_nmi); - free_percpu(trace_profile_buf); - trace_profile_buf_nmi = NULL; - trace_profile_buf = NULL; + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; } fail_buf: atomic_dec(&event->profile_count); @@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) if (!atomic_add_negative(-1, &event->profile_count)) return; - event->profile_disable(); + event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5e9ffc33f6d..1d18315dc83 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_define_common_fields); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(call->data); + call->unregfunc(call); } break; case 1: if (!call->enabled) { call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call->data); + call->regfunc(call); } break; } @@ -507,7 +503,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) +static int __trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + int ret; -#ifdef CONFIG_MODULES + if (!call->name) + return -EINVAL; -static LIST_HEAD(ftrace_module_file_list); + if (call->raw_init) { + ret = call->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "events/%s\n", call->name); + return ret; + } + } -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call); + mutex_unlock(&event_mutex); + return ret; +} static void remove_subsystem_dir(const char *name) { @@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name) } } +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod) return; } call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (!ret) + list_add(&call->list, &ftrace_events); } } @@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->system); + __trace_remove_event_call(call); } } @@ -1202,7 +1258,7 @@ static __init int event_trace_init(void) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1210,10 +1266,12 @@ static __init int event_trace_init(void) continue; } } - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); } while (true) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 92672016da2..50504cb228d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -29,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -46,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -329,22 +332,18 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) return type; } -static int filter_build_regex(struct filter_pred *pred) +static void filter_build_regex(struct filter_pred *pred) { struct regex *r = &pred->regex; - char *search, *dup; - enum regex_type type; - int not; - - type = filter_parse_regex(r->pattern, r->len, &search, ¬); - dup = kstrdup(search, GFP_KERNEL); - if (!dup) - return -ENOMEM; - - strcpy(r->pattern, dup); - kfree(dup); - - r->len = strlen(r->pattern); + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } switch (type) { case MATCH_FULL: @@ -362,14 +361,11 @@ static int filter_build_regex(struct filter_pred *pred) } pred->not ^= not; - - return 0; } /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -542,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -557,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -587,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} + +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; - return -ENOMEM; + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); + + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -615,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -633,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -648,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -666,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -691,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -742,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -776,15 +774,13 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - ret = filter_build_regex(pred); - if (ret) - return ret; + filter_build_regex(pred); if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) - fn = filter_pred_strloc; + fn = filter_pred_strloc; else { fn = filter_pred_pchar; pred->regex.field_len = strlen(pred->regex.pattern); @@ -813,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -1209,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1257,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1272,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1287,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1306,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1324,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1334,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1354,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index c74848ddb85..dff8c84ddf1 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -66,44 +66,47 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __field_desc #define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __array_desc #define __array_desc(type, container, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __dynamic_array #define __dynamic_array(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ if (!ret) \ return 0; @@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ #include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} #undef __field #define __field(type, item) @@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ -static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ #include "trace_entries.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000000..aff5f80b59b --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1523 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_NARGS "__probe_nargs" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_NARGS, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return regs_get_kernel_stack_nth(regs, + (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, + void *dummy) +{ + return kernel_stack_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_call call; + struct trace_event event; + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = regs_query_register_name((unsigned int)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + if (sc->offset) + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, + sc->offset); + else + ret = snprintf(buf, n, "@%s", sc->symbol); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "$retval"); + else if (ff->func == fetch_stack_address) + ret = snprintf(buf, n, "$stack"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + size_t l = 0; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret >= n) + goto end; + l += ret; + ret = probe_arg_string(buf + l, n - l, &id->orig); + if (ret < 0) + goto end; + l += ret; + ret = snprintf(buf + l, n - l, ")"); + ret += l; + } +end: + if (ret >= n) + return -ENOSPC; + return ret; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) { + ff->func = fetch_retvalue; + ff->data = NULL; + } else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + ff->func = fetch_stack_address; + ff->data = NULL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + return ret; +} + +/* Recursive argument parser */ +static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = __parse_probe_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +/* String length checking wrapper */ +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + return __parse_probe_arg(arg, ff, is_return); +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else { + pr_info("Probe definition must be started with 'p' or 'r'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + + if (conflict_field_name(argv[i], tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + if (!tp->args[i].name) { + pr_info("Failed to allocate argument%d name '%s'.\n", + i, argv[i]); + ret = -ENOMEM; + goto error; + } + + /* Parse fetch argument */ + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + if (ret) { + pr_info("Parse error at argument%d. (%d)\n", i, ret); + kfree(tp->args[i].name); + goto error; + } + + tp->nr_args++; + } + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); + if (ret < 0) { + pr_warning("Argument%d decoding error(%d).\n", i, ret); + return ret; + } + seq_printf(m, " %s=%s", tp->args[i].name, buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + ret = trace_define_common_fields(event_call); + if (!ret) + return ret; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int __probe_event_show_format(struct trace_seq *s, + struct trace_probe *tp, const char *fmt, + const char *arg) +{ + int i; + + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;\tsize:%u;\n", name, \ + (unsigned int)offsetof(typeof(field), item),\ + (unsigned int)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx)", + "REC->" FIELD_STRING_IP); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); + SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->" FIELD_STRING_FUNC + ", REC->" FIELD_STRING_RETIP); +} + +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_EVENT_PROFILE */ + + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kprobe_profile_func(kp, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_profile_func(ri, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + if (probe_is_return(tp)) { + tp->event.trace = print_kretprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + tp->event.trace = print_kprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + call->event = &tp->event; + call->id = register_ftrace_event(&tp->event); + if (!call->id) + return -ENODEV; + call->enabled = 0; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + atomic_set(&call->profile_count, -1); + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + unregister_ftrace_event(&tp->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 00000000000..ddfa0fd43bc --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,550 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/fs.h> + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +#include <linux/hw_breakpoint.h> +#include <asm/hw_breakpoint.h> + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. + */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ + +struct trace_ksym { + struct perf_event **ksym_hbp; + struct perf_event_attr attr; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +static DEFINE_MUTEX(ksym_tracer_mutex); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->attr.bp_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct perf_event *hbp, void *data) +{ + struct ring_buffer_event *event; + struct ksym_trace_entry *entry; + struct pt_regs *regs = data; + struct ring_buffer *buffer; + int pc; + + if (!ksym_tracing_enabled) + return; + + buffer = ksym_trace_array->buffer; + + pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hw_breakpoint_addr(hbp)); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *str) +{ + int access = 0; + + if (str[0] == 'r') + access |= HW_BREAKPOINT_R; + + if (str[1] == 'w') + access |= HW_BREAKPOINT_W; + + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; + + switch (access) { + case HW_BREAKPOINT_R: + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; + } +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * <module>:<ksym_name>:<op>. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + int ret; + + *ksymname = strsep(&input_string, ":"); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + + ret = ksym_trace_get_access_type(input_string); + + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret = -ENOMEM; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + hw_breakpoint_init(&entry->attr); + + entry->attr.bp_type = op; + entry->attr.bp_addr = addr; + entry->attr.bp_len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + + if (IS_ERR(entry->ksym_hbp)) { + ret = PTR_ERR(entry->ksym_hbp); + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + goto err; + } + + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; + +err: + kfree(entry); + + return ret; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); + if (entry->attr.bp_type == HW_BREAKPOINT_R) + ret = trace_seq_puts(s, "r--\n"); + else if (entry->attr.bp_type == HW_BREAKPOINT_W) + ret = trace_seq_puts(s, "-w-\n"); + else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); + } + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + mutex_unlock(&ksym_tracer_mutex); + + kfree(s); + + return cnt; +} + +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + input_string = kzalloc(count + 1, GFP_KERNEL); + if (!input_string) + return -ENOMEM; + + if (copy_from_user(input_string, buffer, count)) { + kfree(input_string); + return -EFAULT; + } + input_string[count] = '\0'; + + strstrip(input_string); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!input_string[0] || !strcmp(input_string, "0") || + !strcmp(input_string, "*:---")) { + __ksym_trace_reset(); + kfree(input_string); + return count; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) { + kfree(input_string); + return ret; + } + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->attr.bp_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->attr.bp_type != op) + changed = 1; + else + goto out; + break; + } + } + if (changed) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->attr.bp_type = op; + ret = 0; + if (op > 0) { + entry->ksym_hbp = + register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + if (IS_ERR(entry->ksym_hbp)) + ret = PTR_ERR(entry->ksym_hbp); + else + goto out; + } + /* Error or "symbol:---" case: drop it */ + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + goto out; + } else { + /* Check for malformed request: (4) */ + if (op == 0) + goto out; + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + } +out: + mutex_unlock(&ksym_tracer_mutex); + + kfree(input_string); + + if (!ret) + ret = count; + return ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + ksym_tracing_enabled = 0; + __ksym_trace_reset(); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# TASK-PID CPU# Symbol " + "Type Function\n"); + seq_puts(m, + "# | | | " + " | |\n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct ksym_trace_entry *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->type) { + case HW_BREAKPOINT_R: + ret = trace_seq_printf(s, " R "); + break; + case HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return register_tracer(&ksym_tracer); +} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + access_type = entry->attr.bp_type; + + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } + + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15lu\n", entry->counter); + + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return ksym_filter_head.first; +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d2cdbabb4ea..dc98309e839 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + + ret = process_new_ksym_entry("ksym_selftest_dummy", + HW_BREAKPOINT_R | HW_BREAKPOINT_W, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ddee9c59373..57501d90096 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -51,32 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) -{ - syscalls_metadata[num]->exit_id = id; -} - enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -93,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_id != ent->type) { + if (entry->enter_event->id != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -148,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_id != ent->type) { + if (entry->exit_event->id != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -166,24 +140,19 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; - int nr; int ret; - struct syscall_metadata *entry; + struct syscall_metadata *entry = call->data; struct syscall_trace_enter trace; int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr(call->data); - entry = syscall_nr_to_meta(nr); - - if (!entry) - return 0; - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -193,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -226,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), SYSCALL_FIELD(long, ret)); if (!ret) @@ -239,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; - struct syscall_metadata *meta; + struct syscall_metadata *meta = call->data; int ret; - int nr; int i; int offset = offsetof(typeof(trace), args); - nr = syscall_name_to_nr(call->data); - meta = syscall_nr_to_meta(nr); - - if (!meta) - return 0; - ret = trace_define_common_fields(call); if (ret) return ret; + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i], meta->args[i], offset, @@ -275,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + if (ret) + return ret; + + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -302,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, - size, 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->enter_event->id, size, 0, 0); if (!event) return; @@ -334,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, - sizeof(*entry), 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->exit_event->id, sizeof(*entry), 0, 0); if (!event) return; @@ -348,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(void *ptr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -372,13 +344,11 @@ int reg_event_syscall_enter(void *ptr) return ret; } -void unreg_event_syscall_enter(void *ptr) +void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -389,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(void *ptr) +int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -413,13 +381,11 @@ int reg_event_syscall_exit(void *ptr) return ret; } -void unreg_event_syscall_exit(void *ptr) +void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -430,13 +396,17 @@ void unreg_event_syscall_exit(void *ptr) mutex_unlock(&syscall_trace_lock); } -struct trace_event event_syscall_enter = { - .trace = print_syscall_enter, -}; +int init_syscall_trace(struct ftrace_event_call *call) +{ + int id; -struct trace_event event_syscall_exit = { - .trace = print_syscall_exit, -}; + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + return 0; +} int __init init_ftrace_syscalls(void) { @@ -454,6 +424,10 @@ int __init init_ftrace_syscalls(void) for (i = 0; i < NR_syscalls; i++) { addr = arch_syscall_addr(i); meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; syscalls_metadata[i] = meta; } @@ -473,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; int syscall_nr; + int rctx; int size; int cpu; @@ -498,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; rec = (struct syscall_trace_enter *) raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; + rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_enter(char *name) +int prof_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) @@ -548,13 +525,11 @@ int reg_prof_syscall_enter(char *name) return ret; } -void unreg_prof_syscall_enter(char *name) +void prof_sysenter_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_enter--; @@ -570,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int rctx; int size; int cpu; @@ -596,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -614,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec = (struct syscall_trace_exit *)raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_id; + rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_exit(char *name) +int prof_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) @@ -647,13 +626,11 @@ int reg_prof_syscall_exit(char *name) return ret; } -void unreg_prof_syscall_exit(char *name) +void prof_sysexit_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_exit--; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 69eae358a72..a2cd77e70d4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write, #define proc_do_uts_string NULL #endif - -#ifdef CONFIG_SYSCTL_SYSCALL -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} -#else -#define sysctl_uts_string NULL -#endif - static struct ctl_table uts_kern_table[] = { { - .ctl_name = KERN_OSTYPE, .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_OSRELEASE, .procname = "osrelease", .data = init_uts_ns.name.release, .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_VERSION, .procname = "version", .data = init_uts_ns.name.version, .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_NODENAME, .procname = "hostname", .data = init_uts_ns.name.nodename, .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_DOMAINNAME, .procname = "domainname", .data = init_uts_ns.name.domainname, .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, {} }; static struct ctl_table uts_root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = uts_kern_table, |