diff options
Diffstat (limited to 'kernel')
62 files changed, 1662 insertions, 851 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d1574d47cf2..271fd3119af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -176,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ - -keyout signing_key.priv + -keyout signing_key.priv 2>&1 @echo "###" @echo "### Key pair generated." @echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index b9bd7f098ee..8d6e145138b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_swaps = encode_comp_t(0); /* + * Get freeze protection. If the fs is frozen, just skip the write + * as we could deadlock the system otherwise. + */ + if (!file_start_write_trylock(file)) + goto out; + /* * Kernel segment override to datasegment and write it * to the accounting file. */ @@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); + file_end_write(file); out: revert_creds(orig_cred); } diff --git a/kernel/audit.c b/kernel/audit.c index 9816a1b96cf..0b084fa44b1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -58,7 +58,7 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -910,7 +910,7 @@ static void audit_receive_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; /* - * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; @@ -919,13 +919,13 @@ static void audit_receive_skb(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - while (NLMSG_OK(nlh, len)) { + while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - nlh = NLMSG_NEXT(nlh, len); + nlh = nlmsg_next(nlh, &len); } } @@ -1483,7 +1483,7 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3abce2d645..2a9926275f8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4678,7 +4678,7 @@ out: */ /* TODO: Use a proper seq_file iterator */ -static int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; @@ -4730,19 +4730,6 @@ out: return retval; } -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { diff --git a/kernel/compat.c b/kernel/compat.c index 19971d8c729..0a09e481b70 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) -{ - struct rusage r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrusage(who, (struct rusage __user *) &r); - set_fs(old_fs); - - if (ret) - return ret; - - if (put_compat_rusage(&r, ru)) - return -EFAULT; - - return 0; -} - COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, @@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, } #endif -struct compat_sysinfo { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - u16 procs; - u16 pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(u32)-sizeof(int)]; -}; - -asmlinkage long -compat_sys_sysinfo(struct compat_sysinfo __user *info) -{ - struct sysinfo s; - - do_sysinfo(&s); - - /* Check to see if any memory value is too large for 32-bit and scale - * down if needed - */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - int bitcount = 0; - - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - - s.totalram >>= bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || - __put_user (s.uptime, &info->uptime) || - __put_user (s.loads[0], &info->loads[0]) || - __put_user (s.loads[1], &info->loads[1]) || - __put_user (s.loads[2], &info->loads[2]) || - __put_user (s.totalram, &info->totalram) || - __put_user (s.freeram, &info->freeram) || - __put_user (s.sharedram, &info->sharedram) || - __put_user (s.bufferram, &info->bufferram) || - __put_user (s.totalswap, &info->totalswap) || - __put_user (s.freeswap, &info->freeswap) || - __put_user (s.procs, &info->procs) || - __put_user (s.totalhigh, &info->totalhigh) || - __put_user (s.freehigh, &info->freehigh) || - __put_user (s.mem_unit, &info->mem_unit)) - return -EFAULT; - - return 0; -} - COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, struct compat_timespec __user *, interval) diff --git a/kernel/configs.c b/kernel/configs.c index 42e8fa075ee..c18b1f1ae51 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -79,7 +79,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - entry->size = kernel_config_data_size; + proc_set_size(entry, kernel_config_data_size); return 0; } diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 168cf407a25..8b86c0c68ed 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -76,7 +76,16 @@ static void cpu_idle_loop(void) local_irq_disable(); arch_cpu_idle_enter(); - if (cpu_idle_force_poll) { + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { cpu_idle_poll(); } else { current_clr_polling(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 12331120767..64b3f791bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2609,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void) * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ -static int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; @@ -2643,19 +2643,6 @@ out_free: out: return retval; } - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task mems_allowed in /proc/<pid>/status file. */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index c26278fd485..0506d447aed 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key) static struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, - .help_msg = "debug(G)", + .help_msg = "debug(g)", .action_msg = "DEBUG", }; #endif diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 97fddb09762..cd55144270b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -326,11 +326,16 @@ void rb_free(struct ring_buffer *rb) } #else +static int data_page_nr(struct ring_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { - if (pgoff > (1UL << page_order(rb))) + /* The '>' counts in the user page. */ + if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); @@ -350,10 +355,11 @@ static void rb_free_work(struct work_struct *work) int i, nr; rb = container_of(work, struct ring_buffer, work); - nr = 1 << page_order(rb); + nr = data_page_nr(rb); base = rb->user_page; - for (i = 0; i < nr + 1; i++) + /* The '<=' counts in the user page. */ + for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); @@ -387,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; rb->page_order = ilog2(nr_pages); - rb->nr_pages = 1; + rb->nr_pages = !!nr_pages; ring_buffer_init(rb, watermark, flags); diff --git a/kernel/exit.c b/kernel/exit.c index 60bc027c61c..af2eb3cbd49 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,7 +847,7 @@ void do_exit(long code) exit_io_context(tsk); if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); + free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); @@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, } put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, ret = do_wait(&wo); put_pid(pid); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/extable.c b/kernel/extable.c index fe35a634bf7..67460b93b1a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) + if (main_extable_sort_needed) { + pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); - else - pr_notice("__ex_table already sorted, skipping sort\n"); + } } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index 339f60dfd62..7d40687b143 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1677,10 +1677,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); - asmlinkage_protect(5, ret, clone_flags, newsp, - parent_tidptr, child_tidptr, tls_val); - return ret; + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index abfd89d687a..fd4b13b131f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -84,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -91,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -107,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -116,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -276,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } @@ -652,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* @@ -1011,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1028,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1310,6 +1328,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < expires_next.tv64) expires_next = expires; break; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 397db02209e..19ed5c425c3 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file_inode(file))->data; + unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); cpumask_var_t new_value; int err; @@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_affinity_proc_fops = { @@ -212,7 +212,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE(inode)->data); + return single_open(file, default_affinity_show, PDE_DATA(inode)); } static const struct file_operations default_affinity_proc_fops = { @@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) static int irq_node_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_node_proc_show, PDE(inode)->data); + return single_open(file, irq_node_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_node_proc_fops = { @@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, PDE(inode)->data); + return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_spurious_proc_fops = { @@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) { - struct irq_desc *desc = irq_to_desc(irq); - - remove_proc_entry(action->dir->name, desc->dir); - } + proc_remove(action->dir); } static void register_default_affinity_proc(void) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2169feeba52..3127ad52cdb 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr) /* * Expand a compressed symbol data into the resulting uncompressed string, + * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +static unsigned int kallsyms_expand_symbol(unsigned int off, + char *result, size_t maxlen) { int len, skipped_first = 0; const u8 *tptr, *data; @@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) while (*tptr) { if (skipped_first) { + if (maxlen <= 1) + goto tail; *result = *tptr; result++; + maxlen--; } else skipped_first = 1; tptr++; } } - *result = '\0'; +tail: + if (maxlen) + *result = '\0'; /* Return to offset to the next symbol. */ return off; @@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned int off; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) return kallsyms_addresses[i]; @@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); if (ret != 0) return ret; @@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr, pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); + kallsyms_expand_symbol(get_symbol_offset(pos), + namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; return namebuf; @@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); + kallsyms_expand_symbol(get_symbol_offset(pos), + symname, KSYM_NAME_LEN); return 0; } /* See if it's in a module. */ @@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, pos = get_symbol_pos(addr, size, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); + kallsyms_expand_symbol(get_symbol_offset(pos), + name, KSYM_NAME_LEN); modname[0] = '\0'; return 0; } @@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) iter->type = kallsyms_get_symbol_type(off); - off = kallsyms_expand_symbol(off, iter->name); + off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } diff --git a/kernel/kexec.c b/kernel/kexec.c index b574920cbd4..59f7b55ba74 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image, /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); result = copy_from_user(ptr, buf, uchunk); kunmap(page); @@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, * We do things a page at a time for the sake of kmap. */ unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ptr = kmap(page); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } @@ -1540,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; char buf[0x50]; - int r; + size_t r; va_start(args, fmt); r = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); diff --git a/kernel/kmod.c b/kernel/kmod.c index 56dd34976d7..1296e72e416 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info) static int call_modprobe(char *module_name, int wait) { + struct subprocess_info *info; static char *envp[] = { "HOME=/", "TERM=linux", @@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait) argv[3] = module_name; /* check free_modprobe_argv() */ argv[4] = NULL; - return call_usermodehelper_fns(modprobe_path, argv, envp, - wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); + info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, + NULL, free_modprobe_argv, NULL); + if (!info) + goto free_module_name; + + return call_usermodehelper_exec(info, wait | UMH_KILLABLE); + +free_module_name: + kfree(module_name); free_argv: kfree(argv); out: @@ -502,14 +510,28 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. */ -static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask) + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) { struct subprocess_info *sub_info; sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); @@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; out: return sub_info; } - -/** - * call_usermodehelper_setfns - set a cleanup/init function - * @info: a subprocess_info returned by call_usermodehelper_setup - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -static -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - info->cleanup = cleanup; - info->init = init; - info->data = data; -} +EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. - * when -1 don't wait at all, but you get no useful error back when - * the program couldn't be exec'ed. This makes it safe to call + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -615,31 +614,34 @@ unlock: helper_unlock(); return retval; } +EXPORT_SYMBOL(call_usermodehelper_exec); -/* - * call_usermodehelper_fns() will not run the caller-provided cleanup function - * if a memory allocation failure is experienced. So the caller might need to - * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform - * the necessaary cleanup within the caller. +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). */ -int call_usermodehelper_fns( - char *path, char **argv, char **envp, int wait, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data) +int call_usermodehelper(char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - info = call_usermodehelper_setup(path, argv, envp, gfp_mask); - + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); if (info == NULL) return -ENOMEM; - call_usermodehelper_setfns(info, init, cleanup, data); - return call_usermodehelper_exec(info, wait); } -EXPORT_SYMBOL(call_usermodehelper_fns); +EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/kthread.c b/kernel/kthread.c index 16d8ddd268b..760e86df8c2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> +#include <linux/uaccess.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -135,6 +136,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +/** + * probe_kthread_data - speculative version of kthread_data() + * @task: possible kthread task in question + * + * @task could be a kthread task. Return the data value specified when it + * was created if accessible. If @task isn't a kthread task or its data is + * inaccessible for any reason, %NULL is returned. This function requires + * that @task itself is safe to dereference. + */ +void *probe_kthread_data(struct task_struct *task) +{ + struct kthread *kthread = to_kthread(task); + void *data = NULL; + + probe_kernel_read(&data, &kthread->data, sizeof(data)); + return data; +} + static void __kthread_parkme(struct kthread *self) { __set_current_state(TASK_PARKED); diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6e613..4a9a86d12c8 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -1,15 +1,8 @@ -/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ -#ifndef SYMBOL_PREFIX -#define ASM_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include <linux/export.h> #define GLOBAL(name) \ - .globl ASM_SYMBOL(name); \ - ASM_SYMBOL(name): + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): .section ".init.data","aw" diff --git a/kernel/module.c b/kernel/module.c index 0925c9a7197..b049939177f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, /* Since this should be found in kernel (which can't be removed), * no locking is necessary. */ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc, + return check_version(sechdrs, versindex, + VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); } @@ -1861,12 +1862,12 @@ static void free_module(struct module *mod) { trace_module_free(mod); - /* Delete from various lists */ - mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); - mutex_unlock(&module_mutex); mod_sysfs_teardown(mod); + /* We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. */ + mod->state = MODULE_STATE_UNFORMED; + /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1879,6 +1880,11 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + /* Now we can delete it from the lists */ + mutex_lock(&module_mutex); + stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); + /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index afc0456f227..364ceab15f0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,7 +22,7 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_inode *ei; + struct proc_ns *ei; struct file *file; int err; @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file_inode(file)); + ei = get_proc_ns(file_inode(file)); ops = ei->ns_ops; if (nstype && (ops->type != nstype)) goto out; diff --git a/kernel/panic.c b/kernel/panic.c index 7c57cc9eee2..167ec097ce8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -22,7 +22,6 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> -#include <linux/dmi.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -400,13 +399,8 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - const char *board; - printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (board) - printk(KERN_WARNING "Hardware name: %s\n", board); if (args) vprintk(args->fmt, args->args); diff --git a/kernel/pid.c b/kernel/pid.c index 047dc626463..0db3e791a06 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> +#include <linux/proc_ns.h> #include <linux/proc_fs.h> #define pid_hashfn(nr, ns) \ @@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - static inline int mk_pid(struct pid_namespace *pid_ns, struct pidmap *map, int off) { @@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) break; } if (likely(atomic_read(&map->nr_free))) { - do { + for ( ; ; ) { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); + if (offset >= BITS_PER_PAGE) + break; pid = mk_pid(pid_ns, map, offset); - } while (offset < BITS_PER_PAGE && pid < pid_max); + if (pid >= pid_max) + break; + } } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index bea15bdf82b..6917e8edb48 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,12 +15,10 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> -#define BITS_PER_PAGE (PAGE_SIZE*8) - struct pid_cache { int nr_ids; char name[16]; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c55c2..424c2d4265c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/idr.h> +#include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> +#include <linux/hashtable.h> /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id <id> - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to <ptr> - * void idr_remove(struct idr *idp, int id); to release <id> - * void idr_init(struct idr *idp); to initialize <idp> - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. */ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +309,16 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,11 +336,11 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - - idr_preload(GFP_KERNEL); - spin_lock_irq(&idr_lock); - error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); - spin_unlock_irq(&idr_lock); - idr_preload_end(); - if (error < 0) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - if (error == -ENOSPC) - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } - new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { diff --git a/kernel/power/console.c b/kernel/power/console.c index b1dc456474b..463aa673675 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -4,6 +4,7 @@ * Originally from swsusp. */ +#include <linux/console.h> #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/vt.h> @@ -14,8 +15,120 @@ static int orig_fgconsole, orig_kmsg; +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) +{ + struct pm_vt_switch *entry, *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. + */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + break; + } + } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; + } + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + int pm_prepare_console(void) { + if (!pm_vt_switch()) + return 0; + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) return 1; @@ -26,6 +139,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { + if (!pm_vt_switch()) + return; + if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 68197a4e8fc..7ef6866b521 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -32,7 +32,7 @@ static void handle_poweroff(int key) static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, - .help_msg = "powerOff", + .help_msg = "poweroff(o)", .action_msg = "Power Off", .enable_mask = SYSRQ_ENABLE_BOOT, }; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d4feda084a3..bef86d121eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE) - return true; + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && + pm_test_level != TEST_FREEZER && + pm_test_level != TEST_DEVICES && + pm_test_level != TEST_PLATFORM) { + printk(KERN_WARNING "Unsupported pm_test mode for " + "freeze state, please choose " + "none/freezer/devices/platform.\n"); + return false; + } +#endif + return true; + } /* * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel * support and need to be valid to the lowlevel @@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + /* * PM_SUSPEND_FREEZE equals * frozen processes + suspended devices + idle processors. @@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - if (suspend_test(TEST_PLATFORM)) - goto Platform_wake; - error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; diff --git a/kernel/printk.c b/kernel/printk.c index 376914e2869..96dcfcd9a2d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -43,6 +43,7 @@ #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> +#include <linux/utsname.h> #include <asm/uaccess.h> @@ -2849,4 +2850,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +static char dump_stack_arch_desc_str[128]; + +/** + * dump_stack_set_arch_desc - set arch-specific str to show with task dumps + * @fmt: printf-style format string + * @...: arguments for the format string + * + * The configured string will be printed right after utsname during task + * dumps. Usually used to add arch-specific system identifiers. If an + * arch wants to make use of such an ID string, it should initialize this + * as soon as possible during boot. + */ +void __init dump_stack_set_arch_desc(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), + fmt, args); + va_end(args); +} + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). + */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + if (dump_stack_arch_desc_str[0] != '\0') + printk("%sHardware name: %s\n", + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); +} + +/** + * show_regs_print_info - print generic debug info for show_regs() + * @log_lvl: log level + * + * show_regs() implementations can use this function to print out generic + * debug information. + */ +void show_regs_print_info(const char *log_lvl) +{ + dump_stack_print_info(log_lvl); + + printk("%stask: %p ti: %p task.ti: %p\n", + log_lvl, current, current_thread_info(), + task_thread_info(current)); +} + #endif diff --git a/kernel/profile.c b/kernel/profile.c index dc3384ee874..0bf40073766 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = { .write = prof_cpu_mask_proc_write, }; -void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) +void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ - proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); + proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops); } /* @@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ NULL, &proc_profile_operations); if (!entry) return 0; - entry->size = (1+prof_len) * sizeof(atomic_t); + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); hotcpu_notifier(profile_cpu_callback, 0); return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index acbd28424d8..17ae54da0ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -24,6 +24,7 @@ #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> +#include <linux/compat.h> static int ptrace_trapping_sleep_fn(void *flags) @@ -618,6 +619,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) return error; } +static int ptrace_peek_siginfo(struct task_struct *child, + unsigned long addr, + unsigned long data) +{ + struct ptrace_peeksiginfo_args arg; + struct sigpending *pending; + struct sigqueue *q; + int ret, i; + + ret = copy_from_user(&arg, (void __user *) addr, + sizeof(struct ptrace_peeksiginfo_args)); + if (ret) + return -EFAULT; + + if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) + return -EINVAL; /* unknown flags */ + + if (arg.nr < 0) + return -EINVAL; + + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) + pending = &child->signal->shared_pending; + else + pending = &child->pending; + + for (i = 0; i < arg.nr; ) { + siginfo_t info; + s32 off = arg.off + i; + + spin_lock_irq(&child->sighand->siglock); + list_for_each_entry(q, &pending->list, list) { + if (!off--) { + copy_siginfo(&info, &q->info); + break; + } + } + spin_unlock_irq(&child->sighand->siglock); + + if (off >= 0) /* beyond the end of the list */ + break; + +#ifdef CONFIG_COMPAT + if (unlikely(is_compat_task())) { + compat_siginfo_t __user *uinfo = compat_ptr(data); + + ret = copy_siginfo_to_user32(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } else +#endif + { + siginfo_t __user *uinfo = (siginfo_t __user *) data; + + ret = copy_siginfo_to_user(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } + + if (ret) { + ret = -EFAULT; + break; + } + + data += sizeof(siginfo_t); + i++; + + if (signal_pending(current)) + break; + + cond_resched(); + } + + if (i > 0) + return i; + + return ret; +} #ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) @@ -748,6 +824,10 @@ int ptrace_request(struct task_struct *child, long request, ret = put_user(child->ptrace_message, datalp); break; + case PTRACE_PEEKSIGINFO: + ret = ptrace_peek_siginfo(child, addr, data); + break; + case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) diff --git a/kernel/range.c b/kernel/range.c index 9b8ae2d6ed6..071b0ab455c 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) range[i].end = range[j].end; range[i].start = end; } else { - printk(KERN_ERR "run of slot in ranges\n"); + pr_err("%s: run out of slot in ranges\n", + __func__); } range[j].end = start; continue; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 71bd7337d0c..170814dc418 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1706,7 +1706,7 @@ static void rcu_prepare_for_idle(int cpu) return; /* If this is a no-CBs CPU, no callbacks, just return. */ - if (is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(cpu)) return; /* @@ -1748,7 +1748,7 @@ static void rcu_cleanup_after_idle(int cpu) struct rcu_data *rdp; struct rcu_state *rsp; - if (is_nocb_cpu(cpu)) + if (rcu_is_nocb_cpu(cpu)) return; rcu_try_advance_all_cbs(); for_each_rcu_flavor(rsp) { diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 49099e81c87..cf6c1741293 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -95,7 +95,7 @@ static const struct file_operations rcubarrier_fops = { .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -206,7 +206,7 @@ static const struct file_operations rcuexp_fops = { .open = rcuexp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -306,7 +306,7 @@ static const struct file_operations rcuhier_fops = { .open = rcuhier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) @@ -348,7 +348,7 @@ static const struct file_operations rcugp_fops = { .open = rcugp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) diff --git a/kernel/relay.c b/kernel/relay.c index 01ab081ac53..eef0d113b79 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename, chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; - chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; if (base_filename) { @@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, static int subbuf_read_actor(size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor) + read_descriptor_t *desc) { void *from; int ret = 0; @@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start, typedef int (*subbuf_actor_t) (size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor); + read_descriptor_t *desc); /* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, subbuf_actor_t subbuf_actor, - read_actor_t actor, read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; @@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, break; avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc, actor); + ret = subbuf_actor(read_start, buf, avail, desc); if (desc->error < 0) break; @@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp, desc.count = count; desc.arg.buf = buffer; desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, - NULL, &desc); + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf986a091..58453b8272f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4653,6 +4653,7 @@ void sched_show_task(struct task_struct *p) task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); + print_worker_info(KERN_INFO, p); show_stack(p, NULL); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ea32f02bf2c..cc2dc3eea8a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -506,34 +506,47 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total with reduced chances - * of multiplication overflows by using smaller factors - * like quotient and remainders of divisions between - * rtime and total. + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * loosing precision when the numbers are big. */ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) { - u64 rem, res, scaled; + u64 scaled; - if (rtime >= total) { - /* - * Scale up to rtime / total then add - * the remainder scaled to stime / total. - */ - res = div64_u64_rem(rtime, total, &rem); - scaled = stime * res; - scaled += div64_u64(stime * rem, total); - } else { - /* - * Same in reverse: scale down to total / rtime - * then substract that result scaled to - * to the remaining part. - */ - res = div64_u64_rem(total, rtime, &rem); - scaled = div64_u64(stime, res); - scaled -= div64_u64(scaled * rem, total); + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; + } + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? */ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; } + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); return (__force cputime_t) scaled; } @@ -545,7 +558,7 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, stime, total; + cputime_t rtime, stime, utime, total; if (vtime_accounting_enabled()) { *ut = curr->utime; @@ -568,13 +581,21 @@ static void cputime_adjust(struct task_cputime *curr, */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); - if (!rtime) { - stime = 0; - } else if (!total) { - stime = rtime; - } else { + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + if (total) { stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } else { + stime = rtime; + utime = 0; } /* @@ -583,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr, * Let's enforce monotonicity. */ prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, rtime - prev->stime); + prev->utime = max(prev->utime, utime); +out: *ut = prev->utime; *st = prev->stime; } diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index e036eda1a9c..da98af347e8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file) return seq_open(file, &schedstat_sops); } -static int schedstat_release(struct inode *inode, struct file *file) -{ - return 0; -}; - static const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = schedstat_release, + .release = seq_release, }; static int __init proc_schedstat_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5af44b59377..b7a10048a32 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_ALU_AND_X: case BPF_S_ALU_OR_K: case BPF_S_ALU_OR_X: + case BPF_S_ALU_XOR_K: + case BPF_S_ALU_XOR_X: case BPF_S_ALU_LSH_K: case BPF_S_ALU_LSH_X: case BPF_S_ALU_RSH_K: diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 4567fc020fe..6815171a4ff 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL(up); struct semaphore_waiter { struct list_head list; struct task_struct *task; - int up; + bool up; }; /* @@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state, list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; - waiter.up = 0; + waiter.up = false; for (;;) { if (signal_pending_state(state, task)) goto interrupted; - if (timeout <= 0) + if (unlikely(timeout <= 0)) goto timed_out; __set_task_state(task, state); raw_spin_unlock_irq(&sem->lock); @@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem) struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); - waiter->up = 1; + waiter->up = true; wake_up_process(waiter->task); } diff --git a/kernel/signal.c b/kernel/signal.c index 598dc06be42..113411bfe8b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -32,6 +32,7 @@ #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> +#include <linux/cn_proc.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -854,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, bool force) +static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; - if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { + if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { + if (signal->flags & SIGNAL_GROUP_COREDUMP) + return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. */ @@ -1160,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", - current->comm, task_pid_nr(current), signr); + printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) printk(KERN_INFO "code at %08lx: ", regs->ip); @@ -2350,6 +2352,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(info->si_signo); + proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with diff --git a/kernel/smp.c b/kernel/smp.c index 8e451f3ff51..4dba0f7b72a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -100,16 +100,16 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *csd) { - while (data->flags & CSD_FLAG_LOCK) + while (csd->flags & CSD_FLAG_LOCK) cpu_relax(); } -static void csd_lock(struct call_single_data *data) +static void csd_lock(struct call_single_data *csd) { - csd_lock_wait(data); - data->flags = CSD_FLAG_LOCK; + csd_lock_wait(csd); + csd->flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment @@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data) smp_mb(); } -static void csd_unlock(struct call_single_data *data) +static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_mb(); - data->flags &= ~CSD_FLAG_LOCK; + csd->flags &= ~CSD_FLAG_LOCK; } /* @@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data) * ->func, ->info, and ->flags set. */ static -void generic_exec_single(int cpu, struct call_single_data *data, int wait) +void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); + list_add_tail(&csd->list, &dst->list); raw_spin_unlock_irqrestore(&dst->lock, flags); /* @@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) arch_send_call_function_single_ipi(cpu); if (wait) - csd_lock_wait(data); + csd_lock_wait(csd); } /* @@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); - unsigned int data_flags; LIST_HEAD(list); /* @@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void) raw_spin_unlock(&q->lock); while (!list_empty(&list)) { - struct call_single_data *data; + struct call_single_data *csd; + unsigned int csd_flags; - data = list_entry(list.next, struct call_single_data, list); - list_del(&data->list); + csd = list_entry(list.next, struct call_single_data, list); + list_del(&csd->list); /* - * 'data' can be invalid after this call if flags == 0 + * 'csd' can be invalid after this call if flags == 0 * (when called through generic_exec_single()), * so save them away before making the call: */ - data_flags = data->flags; + csd_flags = csd->flags; - data->func(data->info); + csd->func(csd->info); /* * Unlocked CSDs are valid through generic_exec_single(): */ - if (data_flags & CSD_FLAG_LOCK) - csd_unlock(data); + if (csd_flags & CSD_FLAG_LOCK) + csd_unlock(csd); } } @@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, local_irq_restore(flags); } else { if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; + struct call_single_data *csd = &d; if (!wait) - data = &__get_cpu_var(csd_data); + csd = &__get_cpu_var(csd_data); - csd_lock(data); + csd_lock(csd); - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); + csd->func = func; + csd->info = info; + generic_exec_single(cpu, csd, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. */ -void __smp_call_function_single(int cpu, struct call_single_data *data, +void __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { unsigned int this_cpu; @@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, if (cpu == this_cpu) { local_irq_save(flags); - data->func(data->info); + csd->func(csd->info); local_irq_restore(flags); } else { - csd_lock(data); - generic_exec_single(cpu, data, wait); + csd_lock(csd); + generic_exec_single(cpu, csd, wait); } put_cpu(); } @@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { - struct call_function_data *data; + struct call_function_data *cfd; int cpu, next_cpu, this_cpu = smp_processor_id(); /* @@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = &__get_cpu_var(cfd_data); + cfd = &__get_cpu_var(cfd_data); - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); + cpumask_and(cfd->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!cpumask_weight(data->cpumask))) + if (unlikely(!cpumask_weight(cfd->cpumask))) return; /* - * After we put an entry into the list, data->cpumask - * may be cleared again when another CPU sends another IPI for - * a SMP function call, so data->cpumask will be zero. + * After we put an entry into the list, cfd->cpumask may be cleared + * again when another CPU sends another IPI for a SMP function call, so + * cfd->cpumask will be zero. */ - cpumask_copy(data->cpumask_ipi, data->cpumask); + cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask, } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(data->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = - per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd; + + csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 51a09d56e78..b5197dcb0da 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -629,8 +629,7 @@ static void remote_softirq_receive(void *data) unsigned long flags; int softirq; - softirq = cp->priv; - + softirq = *(int *)cp->info; local_irq_save(flags); __local_trigger(cp, softirq); local_irq_restore(flags); @@ -640,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir { if (cpu_online(cpu)) { cp->func = remote_softirq_receive; - cp->info = cp; + cp->info = &softirq; cp->flags = 0; - cp->priv = softirq; __smp_call_function_single(cpu, cp, 0); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 0da73cf73e6..b95d3c72ba2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -49,6 +49,11 @@ #include <linux/user_namespace.h> #include <linux/binfmts.h> +#include <linux/sched.h> +#include <linux/rcupdate.h> +#include <linux/uidgid.h> +#include <linux/cred.h> + #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> @@ -1044,6 +1049,67 @@ change_okay: return old_fsgid; } +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_uid()); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_euid()); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_gid()); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_egid()); +} + void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; @@ -1785,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) return getrusage(current, who, ru); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) +{ + struct rusage r; + + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + + k_getrusage(current, who, &r); + return put_compat_rusage(&r, ru); +} +#endif + SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(¤t->fs->umask, mask & S_IRWXUGO); return mask; } -#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; @@ -1985,17 +2064,12 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { return put_user(me->clear_child_tid, tid_addr); } - -#else /* CONFIG_CHECKPOINT_RESTORE */ -static int prctl_set_mm(int opt, unsigned long addr, - unsigned long arg4, unsigned long arg5) -{ - return -EINVAL; -} +#else static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { return -EINVAL; @@ -2245,3 +2319,148 @@ int orderly_poweroff(bool force) return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); + +/** + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill + */ +static int do_sysinfo(struct sysinfo *info) +{ + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); + + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) + goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ + + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); + + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_sysinfo { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + u16 procs; + u16 pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) +{ + struct sysinfo s; + + do_sysinfo(&s); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + int bitcount = 0; + + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; + } + + if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + __put_user(s.uptime, &info->uptime) || + __put_user(s.loads[0], &info->loads[0]) || + __put_user(s.loads[1], &info->loads[1]) || + __put_user(s.loads[2], &info->loads[2]) || + __put_user(s.totalram, &info->totalram) || + __put_user(s.freeram, &info->freeram) || + __put_user(s.sharedram, &info->sharedram) || + __put_user(s.bufferram, &info->bufferram) || + __put_user(s.totalswap, &info->totalswap) || + __put_user(s.freeswap, &info->freeswap) || + __put_user(s.procs, &info->procs) || + __put_user(s.totalhigh, &info->totalhigh) || + __put_user(s.freehigh, &info->freehigh) || + __put_user(s.mem_unit, &info->mem_unit)) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 395084d4ce1..bfd6787b355 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -20,6 +20,7 @@ cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); +cond_syscall(compat_sys_lookup_dcookie); cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); @@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); -cond_syscall(sys32_ipc); +cond_syscall(compat_sys_s390_ipc); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); diff --git a/kernel/time.c b/kernel/time.c index f8342a41efa..d3617dbd3dc 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -138,13 +138,14 @@ int persistent_clock_is_local; */ static inline void warp_clock(void) { - struct timespec adjust; + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; - adjust = current_kernel_time(); - if (sys_tz.tz_minuteswest != 0) persistent_clock_is_local = 1; - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } } /* diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7..12ff13a838c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,13 +18,14 @@ #include <linux/rtc.h> #include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. */ -DEFINE_RAW_SPINLOCK(ntp_lock); - /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -53,9 +54,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -150,8 +146,6 @@ static inline void pps_clear(void) /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -346,10 +340,6 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); - time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -362,20 +352,12 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - raw_spin_unlock_irqrestore(&ntp_lock, flags); - } u64 ntp_tick_length(void) { - unsigned long flags; - s64 ret; - - raw_spin_lock_irqsave(&ntp_lock, flags); - ret = tick_length; - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return ret; + return tick_length; } @@ -393,9 +375,6 @@ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -415,7 +394,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +403,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -479,8 +456,6 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return leap; } @@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status |= txc->status & ~STA_RONLY; } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc) { - struct timespec ts; - int result; - - /* Validate the data before disabling interrupts */ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! @@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc) return -EINVAL; } - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; - getnstimeofday(&ts); + return 0; +} - raw_spin_lock_irq(&ntp_lock); + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ + int result; if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; + txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); - raw_spin_unlock_irq(&ntp_lock); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; @@ -894,7 +860,7 @@ static void hardpps_update_phase(long error) } /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two @@ -905,15 +871,13 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); - raw_spin_lock_irqsave(&ntp_lock, flags); - /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - raw_spin_unlock_irqrestore(&ntp_lock, flags); } -EXPORT_SYMBOL(hardpps); - #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 00000000000..1950cb4ca2a --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 40c10502c9e..206bbfb34e0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { + struct clock_event_device *cur = tick_broadcast_device.evtdev; + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || @@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev) return 0; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); return 1; } @@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); tick_broadcast_clear_oneshot(cpu); } else { tick_device_setup_broadcast_func(dev); @@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void) { raw_spin_lock(&tick_broadcast_lock); - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); raw_spin_unlock(&tick_broadcast_lock); } @@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } @@ -377,13 +387,13 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) broadcast = tick_resume_broadcast_oneshot(bc); break; } @@ -396,25 +406,58 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires, int force) +{ + int ret; if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -480,7 +535,7 @@ again: * Rearm the broadcast device. If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; + ktime_t now; int cpu; /* @@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); } } +out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -583,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + if (was_periodic && !cpumask_empty(tmpmask)) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { @@ -641,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! */ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -665,3 +783,14 @@ bool tick_broadcast_oneshot_available(void) } #endif + +void __init tick_broadcast_init(void) +{ + alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83f2bd96716..5d3fb100bc0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -326,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup) */ dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } raw_spin_unlock_irqrestore(&tick_device_lock, flags); @@ -419,4 +420,5 @@ static struct notifier_block tick_notifier = { void __init tick_init(void) { clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc..f0299eae460 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 @@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } /* * Set the periodic handler in non broadcast mode diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 07929c63357..bc67d4245e1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -730,8 +730,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); ratelimit++; } return false; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98fbe1..98cd470bbe4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,8 +23,13 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include "tick-internal.h" +#include "ntp_internal.h" static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; @@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -335,11 +338,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. + */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + ret = tk->tai_offset; + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __timekeeping_set_tai_offset(tk, tai_offset); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -526,7 +623,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -535,9 +633,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -693,11 +792,10 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -716,7 +814,10 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(tk, &ts); + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - /* re-base the last cycle value */ - tk->clock->cycle_last = tk->clock->read(tk->clock); + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + timekeeping_update(tk, false, true); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -826,7 +975,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -849,7 +999,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } @@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<<shift) + if (offset < interval) return offset; /* Accumulate one shifted interval */ - offset -= tk->cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < tk->cycle_interval) + if (offset < real_tk->cycle_interval) goto out; /* @@ -1238,11 +1393,24 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); - timekeeping_update(tk, false); - + write_seqcount_begin(&timekeeper_seq); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); + write_seqcount_end(&timekeeper_seq); out: - write_sequnlock_irqrestore(&tk->lock, flags); - + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /** @@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&tk->lock, seq)); + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); /** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + struct timespec ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + getnstimeofday(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + clock_was_set_delayed(); + } + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. * diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164..3bdf2832301 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include <asm/uaccess.h> + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + static int timer_list_show(struct seq_file *m, void *v) { + struct timer_list_iter *iter = v; + u64 now = ktime_to_ns(ktime_get()); + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + timer_list_header(NULL, now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(NULL, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} - return 0; +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) { + iter->cpu = -1; + iter->now = ktime_to_ns(ktime_get()); + } else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + return iter; } -void sysrq_timer_list_show(void) +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + ++*offset; + return timer_list_start(file, offset); +} + +static void timer_list_stop(struct seq_file *seq, void *v) { - timer_list_show(NULL, NULL); } +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/timer.c b/kernel/timer.c index 1b7489fdea4..a860bba3441 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, basic process system calls + * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -41,6 +41,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -SYSCALL_DEFINE0(getpid) -{ - return task_tgid_vnr(current); -} - -/* - * Accessing ->real_parent is not SMP-safe, it could - * change from under us. However, we can use a stale - * value of ->real_parent under rcu_read_lock(), see - * release_task()->call_rcu(delayed_put_task_struct). - */ -SYSCALL_DEFINE0(getppid) -{ - int pid; - - rcu_read_lock(); - pid = task_tgid_vnr(rcu_dereference(current->real_parent)); - rcu_read_unlock(); - - return pid; -} - -SYSCALL_DEFINE0(getuid) -{ - /* Only we change this so SMP safe */ - return from_kuid_munged(current_user_ns(), current_uid()); -} - -SYSCALL_DEFINE0(geteuid) -{ - /* Only we change this so SMP safe */ - return from_kuid_munged(current_user_ns(), current_euid()); -} - -SYSCALL_DEFINE0(getgid) -{ - /* Only we change this so SMP safe */ - return from_kgid_munged(current_user_ns(), current_gid()); -} - -SYSCALL_DEFINE0(getegid) -{ - /* Only we change this so SMP safe */ - return from_kgid_munged(current_user_ns(), current_egid()); -} - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); @@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -/* Thread ID - the internal kernel "pid" */ -SYSCALL_DEFINE0(gettid) -{ - return task_pid_vnr(current); -} - -/** - * do_sysinfo - fill in sysinfo struct - * @info: pointer to buffer to fill - */ -int do_sysinfo(struct sysinfo *info) -{ - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - struct timespec tp; - - memset(info, 0, sizeof(struct sysinfo)); - - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - - si_meminfo(info); - si_swapinfo(info); - - /* - * If the sum of all the available memory (i.e. ram + swap) - * is less than can be stored in a 32 bit unsigned long then - * we can be binary compatible with 2.2.x kernels. If not, - * well, in that case 2.2.x was broken anyways... - * - * -Erik Andersen <andersee@debian.org> - */ - - mem_total = info->totalram + info->totalswap; - if (mem_total < info->totalram || mem_total < info->totalswap) - goto out; - bitcount = 0; - mem_unit = info->mem_unit; - while (mem_unit > 1) { - bitcount++; - mem_unit >>= 1; - sav_total = mem_total; - mem_total <<= 1; - if (mem_total < sav_total) - goto out; - } - - /* - * If mem_total did not overflow, multiply all memory values by - * info->mem_unit and set it to 1. This leaves things compatible - * with 2.2.x, and also retains compatibility with earlier 2.4.x - * kernels... - */ - - info->mem_unit = 1; - info->totalram <<= bitcount; - info->freeram <<= bitcount; - info->sharedram <<= bitcount; - info->bufferram <<= bitcount; - info->totalswap <<= bitcount; - info->freeswap <<= bitcount; - info->totalhigh <<= bitcount; - info->freehigh <<= bitcount; - -out: - return 0; -} - -SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) -{ - struct sysinfo val; - - do_sysinfo(&val); - - if (copy_to_user(info, &val, sizeof(struct sysinfo))) - return -EFAULT; - - return 0; -} - static int __cpuinit init_timers_cpu(int cpu) { int j; diff --git a/kernel/uid16.c b/kernel/uid16.c index d7948eb1022..f6c83d7ef00 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,67 +18,43 @@ SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, fd, user, group); - return ret; + return sys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, rgid, egid); - return ret; + return sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - long ret = sys_setgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, ruid, euid); - return ret; + return sys_setreuid(low2highuid(ruid), low2highuid(euid)); } SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - long ret = sys_setuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), + return sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, ruid, euid, suid); - return ret; } SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) @@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), + return sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, rgid, egid, sgid); - return ret; } @@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - long ret = sys_setfsuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - long ret = sys_setfsgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, diff --git a/kernel/user.c b/kernel/user.c index 8e635a18ab5..69b4c3d48cd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,7 +16,7 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> /* * userns count is 1 for root user, 1 for init_uts_ns, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e134d8f365d..d8c30db06c5 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,7 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index a47fc5de311..2fc8576efaa 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,7 +15,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> static struct uts_namespace *create_uts_ns(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 154aa12af48..4aa9f5bc6b2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,6 +46,7 @@ #include <linux/rculist.h> #include <linux/nodemask.h> #include <linux/moduleparam.h> +#include <linux/uaccess.h> #include "workqueue_internal.h" @@ -2197,6 +2198,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; + worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } @@ -4365,6 +4367,83 @@ unsigned int work_busy(struct work_struct *work) } EXPORT_SYMBOL_GPL(work_busy); +/** + * set_worker_desc - set description for the current work item + * @fmt: printf-style format string + * @...: arguments for the format string + * + * This function can be called by a running work function to describe what + * the work item is about. If the worker task gets dumped, this + * information will be printed out together to help debugging. The + * description can be at most WORKER_DESC_LEN including the trailing '\0'. + */ +void set_worker_desc(const char *fmt, ...) +{ + struct worker *worker = current_wq_worker(); + va_list args; + + if (worker) { + va_start(args, fmt); + vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); + va_end(args); + worker->desc_valid = true; + } +} + +/** + * print_worker_info - print out worker information and description + * @log_lvl: the log level to use when printing + * @task: target task + * + * If @task is a worker and currently executing a work item, print out the + * name of the workqueue being serviced and worker description set with + * set_worker_desc() by the currently executing work item. + * + * This function can be safely called on any task as long as the + * task_struct itself is accessible. While safe, this function isn't + * synchronized and may print out mixups or garbages of limited length. + */ +void print_worker_info(const char *log_lvl, struct task_struct *task) +{ + work_func_t *fn = NULL; + char name[WQ_NAME_LEN] = { }; + char desc[WORKER_DESC_LEN] = { }; + struct pool_workqueue *pwq = NULL; + struct workqueue_struct *wq = NULL; + bool desc_valid = false; + struct worker *worker; + + if (!(task->flags & PF_WQ_WORKER)) + return; + + /* + * This function is called without any synchronization and @task + * could be in any state. Be careful with dereferences. + */ + worker = probe_kthread_data(task); + + /* + * Carefully copy the associated workqueue's workfn and name. Keep + * the original last '\0' in case the original contains garbage. + */ + probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); + probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); + probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); + probe_kernel_read(name, wq->name, sizeof(name) - 1); + + /* copy worker description */ + probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); + if (desc_valid) + probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + + if (fn || name[0] || desc[0]) { + printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + if (desc[0]) + pr_cont(" (%s)", desc); + pr_cont("\n"); + } +} + /* * CPU hotplug. * diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 84ab6e1dc6f..ad83c96b2ec 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -29,15 +29,25 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + bool desc_valid; /* ->desc is valid */ struct list_head scheduled; /* L: scheduled works */ + + /* 64 bytes boundary on 64bit, 32 on 32bit */ + struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + /* + * Opaque string set with work_set_desc(). Printed out with task + * dump for debugging - WARN, BUG, panic or sysrq. + */ + char desc[WORKER_DESC_LEN]; + /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; |