diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 6 | ||||
-rw-r--r-- | kernel/audit.c | 2 | ||||
-rw-r--r-- | kernel/audit.h | 7 | ||||
-rw-r--r-- | kernel/audit_watch.c | 3 | ||||
-rw-r--r-- | kernel/auditfilter.c | 65 | ||||
-rw-r--r-- | kernel/auditsc.c | 217 | ||||
-rw-r--r-- | kernel/events/core.c | 21 | ||||
-rw-r--r-- | kernel/fork.c | 13 | ||||
-rw-r--r-- | kernel/irq/irqdomain.c | 33 | ||||
-rw-r--r-- | kernel/kmod.c | 7 | ||||
-rw-r--r-- | kernel/kthread.c | 1 | ||||
-rw-r--r-- | kernel/rcutree.c | 21 | ||||
-rw-r--r-- | kernel/rcutree.h | 6 | ||||
-rw-r--r-- | kernel/sched/core.c | 71 | ||||
-rw-r--r-- | kernel/time.c | 2 | ||||
-rw-r--r-- | kernel/time/Kconfig | 4 | ||||
-rw-r--r-- | kernel/time/alarmtimer.c | 118 | ||||
-rw-r--r-- | kernel/time/jiffies.c | 32 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 117 | ||||
-rw-r--r-- | kernel/timer.c | 10 |
21 files changed, 465 insertions, 293 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 6cd7529c9e6..051e071a06e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, } } -static int acct_on(char *name) +static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; @@ -201,7 +201,7 @@ static int acct_on(char *name) struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); if (IS_ERR(file)) return PTR_ERR(file); @@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - char *tmp = getname(name); + struct filename *tmp = getname(name); if (IS_ERR(tmp)) return (PTR_ERR(tmp)); error = acct_on(tmp); diff --git a/kernel/audit.c b/kernel/audit.c index 4d0ceede331..40414e9143d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1440,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link) ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); + if (!ab) + return; audit_log_format(ab, "op=%s action=denied", operation); audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 9eb3d79482b..d51cba868e1 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,12 +74,15 @@ static inline int audit_hash_ino(u32 ino) return (ino & (AUDIT_INODE_BUCKETS-1)); } +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + extern int audit_match_class(int class, unsigned syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); -extern int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen); +extern int parent_len(const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, const void *payload, int size); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c22ec3d87b..9a9ae6e3d29 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent, /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, NULL)) + if (audit_compare_dname_path(dname, owatch->path, + AUDIT_NAME_FULL)) continue; /* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c4bcdbaf4d4..7f19f23d38a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1298,41 +1298,60 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) } } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, - int *dirlen) +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path) { - int dlen, plen; + int plen; const char *p; - if (!dname || !path) - return 1; - - dlen = strlen(dname); plen = strlen(path); - if (plen < dlen) - return 1; + + if (plen == 0) + return plen; /* disregard trailing slashes */ p = path + plen - 1; while ((*p == '/') && (p > path)) p--; - /* find last path component */ - p = p - dlen + 1; - if (p < path) + /* walk backward until we find the next slash or hit beginning */ + while ((*p != '/') && (p > path)) + p--; + + /* did we find a slash? Then increment to include it in path */ + if (*p == '/') + p++; + + return p - path; +} + +/** + * audit_compare_dname_path - compare given dentry name with last component in + * given path. Return of 0 indicates a match. + * @dname: dentry name that we're comparing + * @path: full pathname that we're comparing + * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL + * here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +{ + int dlen, pathlen; + const char *p; + + dlen = strlen(dname); + pathlen = strlen(path); + if (pathlen < dlen) return 1; - else if (p > path) { - if (*--p != '/') - return 1; - else - p++; - } - /* return length of path's directory component */ - if (dirlen) - *dirlen = p - path; + parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; + if (pathlen - parentlen != dlen) + return 1; + + p = path + parentlen; + return strncmp(p, dname, dlen); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f4a7756f999..2f186ed80c4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@ * a name dynamically and also add those to the list anchored by names_list. */ #define AUDIT_NAMES 5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -106,27 +103,29 @@ struct audit_cap_data { * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ struct audit_names { - struct list_head list; /* audit_context->names_list */ - const char *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - bool name_put; /* call __putname() for this name */ + struct list_head list; /* audit_context->names_list */ + struct filename *name; + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + unsigned char type; /* record type */ + bool name_put; /* call __putname() for this name */ /* * This was an allocated audit_names and not from the array of * names allocated in the task audit context. Thus this name * should be freed on syscall exit */ - bool should_free; + bool should_free; }; struct audit_aux_data { @@ -998,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count); list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } dump_stack(); return; @@ -1555,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); + audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the @@ -1565,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, + audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else @@ -1995,7 +1994,8 @@ retry: #endif } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, + unsigned char type) { struct audit_names *aname; @@ -2010,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } aname->ino = (unsigned long)-1; + aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; @@ -2020,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context) } /** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. + */ +struct filename * +__audit_reusename(const __user char *uptr) +{ + struct audit_context *context = current->audit_context; + struct audit_names *n; + + list_for_each_entry(n, &context->names_list, list) { + if (!n->name) + continue; + if (n->name->uptr == uptr) + return n->name; + } + return NULL; +} + +/** * audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name) { struct audit_context *context = current->audit_context; struct audit_names *n; @@ -2040,13 +2064,19 @@ void __audit_getname(const char *name) return; } - n = audit_alloc_name(context); +#if AUDIT_DEBUG + /* The filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; n->name_put = true; + name->aname = n; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); @@ -2059,7 +2089,7 @@ void __audit_getname(const char *name) * then we delay the putname until syscall exit. * Called from include/linux/fs.h:putname(). */ -void audit_putname(const char *name) +void audit_putname(struct filename *name) { struct audit_context *context = current->audit_context; @@ -2074,7 +2104,7 @@ void audit_putname(const char *name) list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - n->name, n->name ?: "(null)"); + n->name, n->name->name ?: "(null)"); } #endif __putname(name); @@ -2088,8 +2118,8 @@ void audit_putname(const char *name) " put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, - context->in_syscall, name, context->name_count, - context->put_count); + context->in_syscall, name->name, + context->name_count, context->put_count); dump_stack(); } } @@ -2132,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent } /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent? */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(struct filename *name, const struct dentry *dentry, + unsigned int parent) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; @@ -2147,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry) if (!context->in_syscall) return; + if (!name) + goto out_alloc; + +#if AUDIT_DEBUG + /* The struct filename _must_ have a populated ->name */ + BUG_ON(!name->name); +#endif + /* + * If we have a pointer to an audit_names entry already, then we can + * just use it directly if the type is correct. + */ + n = name->aname; + if (n) { + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } + } + list_for_each_entry_reverse(n, &context->names_list, list) { - if (n->name && (n->name == name)) - goto out; + /* does the name pointer match? */ + if (!n->name || n->name->name != name->name) + continue; + + /* match the correct record type */ + if (parent) { + if (n->type == AUDIT_TYPE_PARENT || + n->type == AUDIT_TYPE_UNKNOWN) + goto out; + } else { + if (n->type != AUDIT_TYPE_PARENT) + goto out; + } } - /* unable to find the name from a previous getname() */ - n = audit_alloc_name(context); +out_alloc: + /* unable to find the name from a previous getname(). Allocate a new + * anonymous entry. + */ + n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); if (!n) return; out: + if (parent) { + n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_PARENT; + } else { + n->name_len = AUDIT_NAME_FULL; + n->type = AUDIT_TYPE_NORMAL; + } handle_path(dentry); audit_copy_inode(n, dentry, inode); } /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent + * @dentry: dentry being audited + * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -2174,15 +2249,14 @@ out: * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct dentry *dentry, - const struct inode *parent) +void __audit_inode_child(const struct inode *parent, + const struct dentry *dentry, + const unsigned char type) { struct audit_context *context = current->audit_context; - const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; - struct audit_names *n; - int dirlen = 0; + struct audit_names *n, *found_parent = NULL, *found_child = NULL; if (!context->in_syscall) return; @@ -2190,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry, if (inode) handle_one(inode); - /* parent is more likely, look for it first */ + /* look for a parent entry first */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + if (!n->name || n->type != AUDIT_TYPE_PARENT) continue; if (n->ino == parent->i_ino && - !audit_compare_dname_path(dname, n->name, &dirlen)) { - n->name_len = dirlen; /* update parent data in place */ - found_parent = n->name; - goto add_names; + !audit_compare_dname_path(dname, n->name->name, n->name_len)) { + found_parent = n; + break; } } - /* no matching parent, look for matching child */ + /* is there a matching child entry? */ list_for_each_entry(n, &context->names_list, list) { - if (!n->name) + /* can only match entries that have a name */ + if (!n->name || n->type != type) continue; - /* strcmp() is the more likely scenario */ - if (!strcmp(dname, n->name) || - !audit_compare_dname_path(dname, n->name, &dirlen)) { - if (inode) - audit_copy_inode(n, NULL, inode); - else - n->ino = (unsigned long)-1; - found_child = n->name; - goto add_names; + /* if we found a parent, make sure this one is a child of it */ + if (found_parent && (n->name != found_parent->name)) + continue; + + if (!strcmp(dname, n->name->name) || + !audit_compare_dname_path(dname, n->name->name, + found_parent ? + found_parent->name_len : + AUDIT_NAME_FULL)) { + found_child = n; + break; } } -add_names: if (!found_parent) { - n = audit_alloc_name(context); + /* create a new, "anonymous" parent record */ + n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent); } if (!found_child) { - n = audit_alloc_name(context); - if (!n) + found_child = audit_alloc_name(context, type); + if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - n->name = found_parent; - n->name_len = AUDIT_NAME_FULL; + found_child->name = found_parent->name; + found_child->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - n->name_put = false; + found_child->name_put = false; } - - if (inode) - audit_copy_inode(n, NULL, inode); } + if (inode) + audit_copy_inode(found_child, dentry, inode); + else + found_child->ino = (unsigned long)-1; } EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/events/core.c b/kernel/events/core.c index cda3ebd49e8..dbccf83c134 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->unique_pmu != pmu) + continue; /* ensure we process each cpuctx once */ /* * perf_cgroup_events says at least one @@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode) if (mode & PERF_CGROUP_SWIN) { WARN_ON_ONCE(cpuctx->cgrp); - /* set cgrp before ctxsw in to - * allow event_filter_match() to not - * have to pass task around + /* + * set cgrp before ctxsw in to allow + * event_filter_match() to not have to pass + * task around */ cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -4412,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4558,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4754,7 +4757,7 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->active_pmu != pmu) + if (cpuctx->unique_pmu != pmu) goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -5855,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - if (cpuctx->active_pmu == old_pmu) - cpuctx->active_pmu = pmu; + if (cpuctx->unique_pmu == old_pmu) + cpuctx->unique_pmu = pmu; } } @@ -5991,7 +5994,7 @@ skip_type: cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); - cpuctx->active_pmu = pmu; + cpuctx->unique_pmu = pmu; } got_cpu_context: diff --git a/kernel/fork.c b/kernel/fork.c index 1cd7d581b3b..8b20ab7d3aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1584,7 +1584,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1634,6 +1634,17 @@ long do_fork(unsigned long clone_flags, return nr; } +#ifdef CONFIG_GENERIC_KERNEL_THREAD +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + (unsigned long)arg, NULL, NULL); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db4..4e69e24d3d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * @host_data: Controller private data pointer * * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated. * * This is intended to implement the expected behaviour for most * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) { - if (first_irq > 0) - return irq_domain_add_legacy(of_node, size, first_irq, 0, + if (first_irq > 0) { + int irq_base; + + if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { + /* + * Set the descriptor allocator to search for a + * 1-to-1 mapping, such as irq_alloc_desc_at(). + * Use of_node_to_nid() which is defined to + * numa_node_id() on platforms that have no custom + * implementation. + */ + irq_base = irq_alloc_descs(first_irq, first_irq, size, + of_node_to_nid(of_node)); + if (irq_base < 0) { + WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + first_irq); + irq_base = first_irq; + } + } else + irq_base = first_irq; + + return irq_domain_add_legacy(of_node, size, irq_base, 0, ops, host_data); - else - return irq_domain_add_linear(of_node, size, ops, host_data); + } + + /* A linear domain is the default */ + return irq_domain_add_linear(of_node, size, ops, host_data); } /** diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c..1c317e38683 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@ #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data) retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); + if (!retval) + return 0; /* Exec failed? */ fail: sub_info->retval = retval; - return 0; + do_exit(0); } static int call_helper(void *data) @@ -292,7 +295,7 @@ static int wait_for_helper(void *data) } umh_complete(sub_info); - return 0; + do_exit(0); } /* This is run by khelper thread */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 146a6fa9682..29fb60caecb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> +#include <linux/ptrace.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf0..74df86bd920 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ } @@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + mutex_lock(&rsp->onoff_mutex); /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp) cond_resched(); } - put_online_cpus(); + mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ /* Exclude any attempts to start a new grace period. */ + mutex_lock(&rsp->onoff_mutex); raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ @@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) init_callback_list(rdp); /* Disallow further callbacks on this CPU. */ rdp->nxttail[RCU_NEXT_TAIL] = NULL; + mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); + /* Exclude new grace periods. */ + mutex_lock(&rsp->onoff_mutex); + /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* - * A new grace period might start here. If so, we won't be part - * of it, but that is OK, as we are currently in a quiescent state. - */ - - /* Exclude any attempts to start a new GP on large systems. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; @@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + local_irq_restore(flags); - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + mutex_unlock(&rsp->onoff_mutex); } static void __cpuinit rcu_prepare_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d6832..a240f032848 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,11 +394,17 @@ struct rcu_state { struct rcu_head **orphan_donetail; /* Tail of above. */ long qlen_lazy; /* Number of lazy callbacks. */ long qlen; /* Total number of callbacks. */ + /* End of fields guarded by onofflock. */ + + struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ + struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long n_barrier_done; /* ++ at start and end of */ /* _rcu_barrier(). */ + /* End of fields guarded by barrier_mutex. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1774723643..2d8927fda71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void) #ifdef CONFIG_SMP #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0 #endif void resched_task(struct task_struct *p) @@ -6122,6 +6122,17 @@ static void sched_init_numa(void) * numbers. */ + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -6176,11 +6187,68 @@ static void sched_init_numa(void) } sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; } #else static inline void sched_init_numa(void) { } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -6629,6 +6697,7 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); diff --git a/kernel/time.c b/kernel/time.c index ba744cf8069..d226c6a3fd2 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@ #include <linux/export.h> #include <linux/timex.h> #include <linux/capability.h> -#include <linux/clocksource.h> +#include <linux/timekeeper_internal.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <linux/security.h> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b7..8601f0db126 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA config GENERIC_TIME_VSYSCALL bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD + bool + # ktime_t scalar 64bit nsec representation config KTIME_SCALAR bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc..f11d83b1294 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - struct hrtimer timer; ktime_t (*gettime)(void); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -46,6 +45,8 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; @@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { } * @base: pointer to the base where the timer is being run * @alarm: pointer to alarm being enqueued. * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) { + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + timerqueue_add(&base->timerqueue, &alarm->node); alarm->state |= ALARMTIMER_STATE_ENQUEUED; - - if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { - hrtimer_try_to_cancel(&base->timer); - hrtimer_start(&base->timer, alarm->node.expires, - HRTIMER_MODE_ABS); - } } /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue * @base: pointer to the base where the timer is running * @alarm: pointer to alarm being removed * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue * * Must hold base->lock when calling. */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) { - struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); - if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) return; timerqueue_del(&base->timerqueue, &alarm->node); alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - if (next == &alarm->node) { - hrtimer_try_to_cancel(&base->timer); - next = timerqueue_getnext(&base->timerqueue); - if (!next) - return; - hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); - } } @@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { - struct alarm_base *base = container_of(timer, struct alarm_base, timer); - struct timerqueue_node *next; + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - ktime_t now; int ret = HRTIMER_NORESTART; int restart = ALARMTIMER_NORESTART; spin_lock_irqsave(&base->lock, flags); - now = base->gettime(); - while ((next = timerqueue_getnext(&base->timerqueue))) { - struct alarm *alarm; - ktime_t expired = next->expires; - - if (expired.tv64 > now.tv64) - break; - - alarm = container_of(next, struct alarm, node); - - timerqueue_del(&base->timerqueue, &alarm->node); - alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - - alarm->state |= ALARMTIMER_STATE_CALLBACK; - spin_unlock_irqrestore(&base->lock, flags); - if (alarm->function) - restart = alarm->function(alarm, now); - spin_lock_irqsave(&base->lock, flags); - alarm->state &= ~ALARMTIMER_STATE_CALLBACK; + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); - if (restart != ALARMTIMER_NORESTART) { - timerqueue_add(&base->timerqueue, &alarm->node); - alarm->state |= ALARMTIMER_STATE_ENQUEUED; - } - } + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); - if (next) { - hrtimer_set_expires(&base->timer, next->expires); + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); ret = HRTIMER_RESTART; } spin_unlock_irqrestore(&base->lock, flags); @@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev) unsigned long flags; struct rtc_device *rtc; int i; + int ret; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; @@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev) if (min.tv64 == 0) return 0; - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; } #else static int alarmtimer_suspend(struct device *dev) @@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; alarm->function = function; alarm->type = type; alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @alarm: ptr to alarm to set * @start: time to run the alarm */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; + int ret; spin_lock_irqsave(&base->lock, flags); - if (alarmtimer_active(alarm)) - alarmtimer_remove(base, alarm); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + return ret; } /** @@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret = -1; - spin_lock_irqsave(&base->lock, flags); - - if (alarmtimer_callback_running(alarm)) - goto out; + int ret; - if (alarmtimer_is_queued(alarm)) { - alarmtimer_remove(base, alarm); - ret = 1; - } else - ret = 0; -out: + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -802,10 +773,6 @@ static int __init alarmtimer_init(void) for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); - hrtimer_init(&alarm_bases[i].timer, - alarm_bases[i].base_clockid, - HRTIMER_MODE_ABS); - alarm_bases[i].timer.function = alarmtimer_fired; } error = alarmtimer_rtc_interface_setup(); @@ -821,6 +788,7 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } + ws = wakeup_source_register("alarmtimer"); return 0; out_drv: diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10..6629bf7b528 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However @@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void) { return &clocksource_jiffies; } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f423bdd035c..a4026088526 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (ts->tick_stopped) { touch_softlockup_watchdog(); - if (idle_cpu(cpu)) + if (is_idle_task(current)) ts->idle_jiffies++; } update_process_times(user_mode(regs)); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5ce06a3fa91..e424970bb56 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@ * */ +#include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -21,61 +22,6 @@ #include <linux/tick.h> #include <linux/stop_machine.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { - /* Current clocksource used for timekeeping. */ - struct clocksource *clock; - /* NTP adjusted clock multiplier */ - u32 mult; - /* The shift value of the current clocksource. */ - u32 shift; - /* Number of clock cycles in one NTP interval. */ - cycle_t cycle_interval; - /* Number of clock shifted nano seconds in one NTP interval. */ - u64 xtime_interval; - /* shifted nano seconds left over when rounding cycle_interval */ - s64 xtime_remainder; - /* Raw nano seconds accumulated per NTP interval. */ - u32 raw_interval; - - /* Current CLOCK_REALTIME time in seconds */ - u64 xtime_sec; - /* Clock shifted nano seconds */ - u64 xtime_nsec; - - /* Difference between accumulated time and NTP time in ntp - * shifted nano seconds. */ - s64 ntp_error; - /* Shift conversion between clock shifted nano seconds and - * ntp shifted nano seconds. */ - u32 ntp_error_shift; - - /* - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the - * monotonic time not to jump. We need to add total_sleep_time to - * wall_to_monotonic to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ - struct timespec wall_to_monotonic; - /* Offset clock monotonic -> clock realtime */ - ktime_t offs_real; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* Offset clock monotonic -> clock boottime */ - ktime_t offs_boot; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; - /* Seqlock for all timekeeper values */ - seqlock_t lock; -}; static struct timekeeper timekeeper; @@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) } } -static struct timespec tk_xtime(struct timekeeper *tk) -{ - struct timespec ts; - - ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); - return ts; -} - static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; @@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { - struct timespec xt; - if (clearntp) { tk->ntp_error = 0; ntp_clear(); } - xt = tk_xtime(tk); - update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); + update_vsyscall(tk); } /** @@ -1113,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = tk->raw_interval << shift; + raw_nsecs = (u64)tk->raw_interval << shift; raw_nsecs += tk->raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; @@ -1130,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1141,7 +1102,6 @@ static void update_wall_time(void) cycle_t offset; int shift = 0, maxshift; unsigned long flags; - s64 remainder; write_seqlock_irqsave(&tk->lock, flags); @@ -1183,20 +1143,11 @@ static void update_wall_time(void) /* correct the clock when NTP error is too big */ timekeeping_adjust(tk, offset); - /* - * Store only full nanoseconds into xtime_nsec after rounding - * it up and add the remainder to the error difference. - * XXX - This is necessary to avoid small 1ns inconsistnecies caused - * by truncating the remainder in vsyscalls. However, it causes - * additional work to be done in timekeeping_adjust(). Once - * the vsyscall implementations are converted to use xtime_nsec - * (shifted nanoseconds), this can be killed. - */ - remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); - tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1ULL << tk->shift; - tk->ntp_error += remainder << tk->ntp_error_shift; + * XXX This can be killed once everyone converts + * to the new update_vsyscall. + */ + old_vsyscall_fixup(tk); /* * Finally, make sure that after the rounding diff --git a/kernel/timer.c b/kernel/timer.c index d5de1b2292a..367d0085848 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { struct list_head vec[TVN_SIZE]; @@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); } else { int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; + if (idx > MAX_TVAL) { + idx = MAX_TVAL; expires = idx + base->timer_jiffies; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; |