diff options
Diffstat (limited to 'kernel')
44 files changed, 1406 insertions, 804 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 4a3f28d2ca6..ea3b7b6191c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1456,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key) } /** + * audit_log_link_denied - report a link restriction denial + * @operation: specific link opreation + * @link: the path that triggered the restriction + */ +void audit_log_link_denied(const char *operation, struct path *link) +{ + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_ANOM_LINK); + audit_log_format(ab, "op=%s action=denied", operation); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_d_path(ab, " path=", link); + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + audit_log_end(ab); +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3a5ca582ba1..ed206fd88cc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -293,7 +292,7 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); + fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; Fallback: @@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } @@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); evict_chunk(chunk); - fsnotify_put_mark(entry); + + /* + * We are guaranteed to have at least one reference to the mark from + * either the inode or the caller of fsnotify_destroy_mark(). + */ + BUG_ON(atomic_read(&entry->refcnt) < 1); } static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19..14d32588ccc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd1..f33c7153b6d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + +/* + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) @@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. */ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 8b68ce78ff1..be7b33b73d3 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -12,6 +12,7 @@ #include <linux/kdb.h> #include <linux/kdebug.h> #include <linux/export.h> +#include <linux/hardirq.h> #include "kdb_private.h" #include "../debug_core.h" @@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; + if (in_nmi()) + reason = KDB_REASON_NMI; + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { if ((bp->bp_enabled) && (bp->bp_addr == addr)) { reason = KDB_REASON_BREAK; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bb9520f0f6f..0a69d2adc4f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -715,9 +715,6 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline == linecount) { char buf1[16] = ""; -#if defined(CONFIG_SMP) - char buf2[32]; -#endif /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -732,14 +729,6 @@ kdb_printit: if (moreprompt == NULL) moreprompt = "more> "; -#if defined(CONFIG_SMP) - if (strchr(moreprompt, '%')) { - sprintf(buf2, moreprompt, get_cpu()); - put_cpu(); - moreprompt = buf2; - } -#endif - kdb_input_flush(); c = console_drivers; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1f91413edb8..31df1706b9a 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a040f39..98d4597f43d 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f1cf0edeb39..b7935fcec7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90af..a096c19f2c2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f93532748bc..c08a22d02f7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -32,6 +32,7 @@ #include <linux/swap.h> /* try_to_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ +#include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/uprobes.h> @@ -112,14 +113,14 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) return false; } -static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { - loff_t vaddr; - - vaddr = vma->vm_start + offset; - vaddr -= vma->vm_pgoff << PAGE_SHIFT; + return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); +} - return vaddr; +static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) +{ + return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** @@ -127,25 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) * based on replace_page in mm/ksm.c * * @vma: vma that holds the pte pointing to page + * @addr: address the old @page is mapped at * @page: the cowed page we are replacing by kpage * @kpage: the modified page we replace page by * * Returns 0 on success, -EFAULT on failure. */ -static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) +static int __replace_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr; spinlock_t *ptl; pte_t *ptep; + int err; - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - return -EFAULT; + /* For try_to_free_swap() and munlock_vma_page() below */ + lock_page(page); + err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - return -EAGAIN; + goto unlock; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -162,10 +165,16 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page_remove_rmap(page); if (!page_mapped(page)) try_to_free_swap(page); - put_page(page); pte_unmap_unlock(ptep, ptl); - return 0; + if (vma->vm_flags & VM_LOCKED) + munlock_vma_page(page); + put_page(page); + + err = 0; + unlock: + unlock_page(page); + return err; } /** @@ -206,45 +215,23 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - struct address_space *mapping; void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; - struct uprobe *uprobe; int ret; + retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) return ret; - ret = -EINVAL; - - /* - * We are interested in text pages only. Our pages of interest - * should be mapped for read and execute only. We desist from - * adding probes in write mapped pages since the breakpoints - * might end up in the file copy. - */ - if (!valid_vma(vma, is_swbp_insn(&opcode))) - goto put_out; - - uprobe = container_of(auprobe, struct uprobe, arch); - mapping = uprobe->inode->i_mapping; - if (mapping != vma->vm_file->f_mapping) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) - goto put_out; + goto put_old; __SetPageUptodate(new_page); - /* - * lock page will serialize against do_wp_page()'s - * PageAnon() handling - */ - lock_page(old_page); /* copy the page now that we've got it stable */ vaddr_old = kmap_atomic(old_page); vaddr_new = kmap_atomic(new_page); @@ -257,17 +244,13 @@ retry: ret = anon_vma_prepare(vma); if (ret) - goto unlock_out; + goto put_new; - lock_page(new_page); - ret = __replace_page(vma, old_page, new_page); - unlock_page(new_page); + ret = __replace_page(vma, vaddr, old_page, new_page); -unlock_out: - unlock_page(old_page); +put_new: page_cache_release(new_page); - -put_out: +put_old: put_page(old_page); if (unlikely(ret == -EAGAIN)) @@ -791,7 +774,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) curr = info; info->mm = vma->vm_mm; - info->vaddr = vma_address(vma, offset); + info->vaddr = offset_to_vaddr(vma, offset); } mutex_unlock(&mapping->i_mmap_mutex); @@ -839,12 +822,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) goto free; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)info->vaddr); - if (!vma || !valid_vma(vma, is_register)) + vma = find_vma(mm, info->vaddr); + if (!vma || !valid_vma(vma, is_register) || + vma->vm_file->f_mapping->host != uprobe->inode) goto unlock; - if (vma->vm_file->f_mapping->host != uprobe->inode || - vma_address(vma, uprobe->offset) != info->vaddr) + if (vma->vm_start > info->vaddr || + vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { @@ -960,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } -/* - * Of all the nodes that correspond to the given inode, return the node - * with the least offset. - */ -static struct rb_node *find_least_offset_node(struct inode *inode) +static struct rb_node * +find_node_in_range(struct inode *inode, loff_t min, loff_t max) { - struct uprobe u = { .inode = inode, .offset = 0}; struct rb_node *n = uprobes_tree.rb_node; - struct rb_node *close_node = NULL; - struct uprobe *uprobe; - int match; while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - - if (uprobe->inode == inode) - close_node = n; - - if (!match) - return close_node; + struct uprobe *u = rb_entry(n, struct uprobe, rb_node); - if (match < 0) + if (inode < u->inode) { n = n->rb_left; - else + } else if (inode > u->inode) { n = n->rb_right; + } else { + if (max < u->offset) + n = n->rb_left; + else if (min > u->offset) + n = n->rb_right; + else + break; + } } - return close_node; + return n; } /* - * For a given inode, build a list of probes that need to be inserted. + * For a given range in vma, build a list of probes that need to be inserted. */ -static void build_probe_list(struct inode *inode, struct list_head *head) +static void build_probe_list(struct inode *inode, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *head) { - struct uprobe *uprobe; + loff_t min, max; unsigned long flags; - struct rb_node *n; - - spin_lock_irqsave(&uprobes_treelock, flags); - - n = find_least_offset_node(inode); + struct rb_node *n, *t; + struct uprobe *u; - for (; n; n = rb_next(n)) { - uprobe = rb_entry(n, struct uprobe, rb_node); - if (uprobe->inode != inode) - break; + INIT_LIST_HEAD(head); + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; - list_add(&uprobe->pending_list, head); - atomic_inc(&uprobe->ref); + spin_lock_irqsave(&uprobes_treelock, flags); + n = find_node_in_range(inode, min, max); + if (n) { + for (t = n; t; t = rb_prev(t)) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset < min) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } + for (t = n; (t = rb_next(t)); ) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset > max) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } } - spin_unlock_irqrestore(&uprobes_treelock, flags); } @@ -1031,7 +1022,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe; + struct uprobe *uprobe, *u; struct inode *inode; int ret, count; @@ -1042,21 +1033,15 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!inode) return 0; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); + build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); ret = 0; count = 0; - list_for_each_entry(uprobe, &tmp_list, pending_list) { + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!ret) { - loff_t vaddr = vma_address(vma, uprobe->offset); - - if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { - put_uprobe(uprobe); - continue; - } + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); /* @@ -1097,12 +1082,15 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe; + struct uprobe *uprobe, *u; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; + if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ + return; + if (!atomic_read(&vma->vm_mm->uprobes_state.count)) return; @@ -1110,21 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!inode) return; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); - - list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr = vma_address(vma, uprobe->offset); - - if (vaddr >= start && vaddr < end) { - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); - } + build_probe_list(inode, vma, start, end, &tmp_list); + + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); + /* + * An unregister could have removed the probe before + * unmap. So check before we decrement the count. + */ + if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) + atomic_dec(&vma->vm_mm->uprobes_state.count); put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); @@ -1463,12 +1447,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; + struct inode *inode = vma->vm_file->f_mapping->host; + loff_t offset = vaddr_to_offset(vma, bp_vaddr); - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); uprobe = find_uprobe(inode, offset); } diff --git a/kernel/exit.c b/kernel/exit.c index d17f6c4ddfa..f65345f9e5b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -483,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; diff --git a/kernel/fork.c b/kernel/fork.c index ff1cad3b7bd..2c8857e1285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -114,6 +114,10 @@ int nr_processes(void) return total; } +void __weak arch_release_task_struct(struct task_struct *tsk) +{ +} + #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; @@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node) return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } -void __weak arch_release_task_struct(struct task_struct *tsk) { } - static inline void free_task_struct(struct task_struct *tsk) { - arch_release_task_struct(tsk); kmem_cache_free(task_struct_cachep, tsk); } #endif +void __weak arch_release_thread_info(struct thread_info *ti) +{ +} + #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR -void __weak arch_release_thread_info(struct thread_info *ti) { } /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else @@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); kmem_cache_free(thread_info_cache, ti); } @@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); + arch_release_thread_info(tsk->stack); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); + arch_release_task_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; ti = alloc_thread_info_node(tsk, node); - if (!ti) { - free_task_struct(tsk); - return NULL; - } + if (!ti) + goto free_tsk; err = arch_dup_task_struct(tsk, orig); + if (err) + goto free_ti; - /* - * We defer looking at err, because we will need this setup - * for the clean up path to work correctly. - */ tsk->stack = ti; - setup_thread_stack(tsk, orig); - - if (err) - goto out; + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); @@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return tsk; -out: +free_ti: free_thread_info(ti); +free_tsk: free_task_struct(tsk); return NULL; } @@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); + -vma_pages(mpnt)); continue; } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len; - len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len = vma_pages(mpnt); + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; @@ -459,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - if (file && uprobe_mmap(tmp)) - goto out; + if (file) + uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); @@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif diff --git a/kernel/futex.c b/kernel/futex.c index e2b0fb9a0b3..3717e7b306e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; @@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); @@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. */ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bdb18032555..131ca176b49 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; + unsigned int flags = 0, irq = desc->irq_data.irq; do { irqreturn_t res; @@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - random |= action->flags; + flags |= action->flags; break; default: @@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (random & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 38c5eb839c9..49a77727db4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> @@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, { struct irq_domain *domain; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); + domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, + of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; @@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, } /** + * irq_domain_add_simple() - Allocate and register a simple irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in mapping + * @first_irq: first number of irq block assigned to the domain + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Allocates a legacy irq_domain if irq_base is positive or a linear + * domain otherwise. + * + * This is intended to implement the expected behaviour for most + * interrupt controllers which is that a linear mapping should + * normally be used unless the system requires a legacy mapping in + * order to support supplying interrupt numbers during non-DT + * registration of devices. + */ +struct irq_domain *irq_domain_add_simple(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) +{ + if (first_irq > 0) + return irq_domain_add_legacy(of_node, size, first_irq, 0, + ops, host_data); + else + return irq_domain_add_linear(of_node, size, ops, host_data); +} + +/** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in legacy mapping @@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(domain, irq, hwirq); + if (ops->map) + ops->map(domain, irq, hwirq); /* Clear norequest flags */ irq_clear_status_flags(irq, IRQ_NOREQUEST); @@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: Number of interrupts in the domain. * @ops: map/unmap domain callbacks @@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, struct irq_domain *domain; unsigned int *revmap; - revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, + of_node_to_nid(of_node)); if (WARN_ON(!revmap)) return NULL; @@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); -static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static void irq_domain_disassociate_many(struct irq_domain *domain, + unsigned int irq_base, int count) { - struct irq_data *irq_data = irq_get_irq_data(virq); + /* + * disassociate in reverse order; + * not strictly necessary, but nice for unwinding + */ + while (count--) { + int irq = irq_base + count; + struct irq_data *irq_data = irq_get_irq_data(irq); + irq_hw_number_t hwirq = irq_data->hwirq; + + if (WARN_ON(!irq_data || irq_data->domain != domain)) + continue; + + irq_set_status_flags(irq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(irq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(irq); + + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, irq); + smp_mb(); - irq_data->hwirq = hwirq; - irq_data->domain = domain; - if (domain->ops->map(domain, virq, hwirq)) { - pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq); irq_data->domain = NULL; irq_data->hwirq = 0; - return -1; + + /* Clear reverse map */ + switch(domain->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } } +} + +int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + unsigned int virq = irq_base; + irq_hw_number_t hwirq = hwirq_base; + int i, ret; + + pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, + of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + + for (i = 0; i < count; i++) { + struct irq_data *irq_data = irq_get_irq_data(virq + i); + + if (WARN(!irq_data, "error: irq_desc not allocated; " + "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) + return -EINVAL; + if (WARN(irq_data->domain, "error: irq_desc already associated; " + "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) + return -EINVAL; + }; + + for (i = 0; i < count; i++, virq++, hwirq++) { + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map) { + ret = domain->ops->map(domain, virq, hwirq); + if (ret != 0) { + pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", + virq, hwirq, ret); + WARN_ON(1); + irq_data->domain = NULL; + irq_data->hwirq = 0; + goto err_unmap; + } + } - irq_clear_status_flags(virq, IRQ_NOREQUEST); + switch (domain->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = virq; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + break; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + } return 0; + + err_unmap: + irq_domain_disassociate_many(domain, irq_base, i); + return -EINVAL; } +EXPORT_SYMBOL_GPL(irq_domain_associate_many); /** * irq_create_direct_mapping() - Allocate an irq for direct mapping @@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) if (domain == NULL) domain = irq_default_domain; - BUG_ON(domain == NULL); - WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) + return 0; - virq = irq_alloc_desc_from(1, 0); + virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (!virq) { pr_debug("create_direct virq allocation failed\n"); return 0; @@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) } pr_debug("create_direct obtained virq %d\n", virq); - if (irq_setup_virq(domain, virq, virq)) { + if (irq_domain_associate(domain, virq, virq)) { irq_free_desc(virq); return 0; } @@ -433,17 +555,16 @@ unsigned int irq_create_mapping(struct irq_domain *domain, hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_desc_from(hint, 0); + virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); if (virq <= 0) - virq = irq_alloc_desc_from(1, 0); + virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; } - if (irq_setup_virq(domain, virq, hwirq)) { - if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) - irq_free_desc(virq); + if (irq_domain_associate(domain, virq, hwirq)) { + irq_free_desc(virq); return 0; } @@ -454,6 +575,44 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_create_mapping); +/** + * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs + * @domain: domain owning the interrupt range + * @irq_base: beginning of linux IRQ range + * @hwirq_base: beginning of hardware IRQ range + * @count: Number of interrupts to map + * + * This routine is used for allocating and mapping a range of hardware + * irqs to linux irqs where the linux irq numbers are at pre-defined + * locations. For use by controllers that already have static mappings + * to insert in to the domain. + * + * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time + * domain insertion. + * + * 0 is returned upon success, while any failure to establish a static + * mapping is treated as an error. + */ +int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, + irq_hw_number_t hwirq_base, int count) +{ + int ret; + + ret = irq_alloc_descs(irq_base, irq_base, count, + of_node_to_nid(domain->of_node)); + if (unlikely(ret < 0)) + return ret; + + ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); + if (unlikely(ret < 0)) { + irq_free_descs(irq_base, count); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(irq_create_strict_mappings); + unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { @@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); struct irq_domain *domain; - irq_hw_number_t hwirq; if (!virq || !irq_data) return; @@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq) if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return; - irq_set_status_flags(virq, IRQ_NOREQUEST); - - /* remove chip and handler */ - irq_set_chip_and_handler(virq, NULL, NULL); - - /* Make sure it's completed */ - synchronize_irq(virq); - - /* Tell the PIC about it */ - if (domain->ops->unmap) - domain->ops->unmap(domain, virq); - smp_mb(); - - /* Clear reverse map */ - hwirq = irq_data->hwirq; - switch(domain->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < domain->revmap_data.linear.size) - domain->revmap_data.linear.revmap[hwirq] = 0; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&domain->revmap_data.tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - break; - } - + irq_domain_disassociate_many(domain, virq, 1); irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); * irq_find_mapping() - Find a linux irq from an hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space - * - * This is a slow path, for use by generic code. It's expected that an - * irq controller implementation directly calls the appropriate low level - * mapping function. */ unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int i; - unsigned int hint = hwirq % nr_irqs; + struct irq_data *data; /* Look for default domain if nececssary */ if (domain == NULL) @@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (domain == NULL) return 0; - /* legacy -> bail early */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + switch (domain->revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: return irq_domain_legacy_revmap(domain, hwirq); - - /* Slow path does a linear search of the map */ - if (hint == 0) - hint = 1; - i = hint; - do { - struct irq_data *data = irq_get_irq_data(i); + case IRQ_DOMAIN_MAP_LINEAR: + return irq_linear_revmap(domain, hwirq); + case IRQ_DOMAIN_MAP_TREE: + rcu_read_lock(); + data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + if (data) + return data->irq; + break; + case IRQ_DOMAIN_MAP_NOMAP: + data = irq_get_irq_data(hwirq); if (data && (data->domain == domain) && (data->hwirq == hwirq)) - return i; - i++; - if (i >= nr_irqs) - i = 1; - } while(i != hint); + return hwirq; + break; + } + return 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); /** - * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path, for use by irq controller code that uses radix tree - * revmaps - */ -unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data; - - if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(domain, hwirq); - - /* - * Freeing an irq can delete nodes along the path to - * do the lookup via call_rcu. - */ - rcu_read_lock(); - irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); - rcu_read_unlock(); - - /* - * If found in radix tree, then fine. - * Else fallback to linear lookup - this should not happen in practice - * as it means that we failed to insert the node in the radix tree. - */ - return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); -} -EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup); - -/** - * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. - * @domain: domain owning this hardware interrupt - * @virq: linux irq number - * @hwirq: hardware irq number in that domain space - * - * This is for use by irq controllers that use a radix tree reverse - * mapping for fast lookup. - */ -void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return; - - if (virq) { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } -} -EXPORT_SYMBOL_GPL(irq_radix_revmap_insert); - -/** * irq_linear_revmap() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space * - * This is a fast path, for use by irq controller code that uses linear - * revmaps. It does fallback to the slow path if the revmap doesn't exist - * yet and will create the revmap entry with appropriate locking + * This is a fast path that can be called directly by irq controller code to + * save a handful of instructions. */ unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int *revmap; - - if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(domain, hwirq); + BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); - /* Check revmap bounds */ - if (unlikely(hwirq >= domain->revmap_data.linear.size)) - return irq_find_mapping(domain, hwirq); - - /* Check if revmap was allocated */ - revmap = domain->revmap_data.linear.revmap; - if (unlikely(revmap == NULL)) - return irq_find_mapping(domain, hwirq); - - /* Fill up revmap with slow path if no mapping found */ - if (unlikely(!revmap[hwirq])) - revmap[hwirq] = irq_find_mapping(domain, hwirq); + /* Check revmap bounds; complain if exceeded */ + if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) + return 0; - return revmap[hwirq]; + return domain->revmap_data.linear.revmap[hwirq]; } EXPORT_SYMBOL_GPL(irq_linear_revmap); @@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_IRQ_DOMAIN_DEBUG */ -static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hwirq) -{ - return 0; -} - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); const struct irq_domain_ops irq_domain_simple_ops = { - .map = irq_domain_simple_map, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 814c9ef6bba..4c69326aa77 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } /* * Check whether the interrupt nests into another interrupt @@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + + /* * The following block of code has to be executed atomically */ raw_spin_lock_irqsave(&desc->lock, flags); @@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it @@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 4e2e472f6ae..0668d58d641 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index ff2c7cb86d7..6f99aead66c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,6 +45,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +/* + * kmod_thread_locker is used for deadlock avoidance. There is no explicit + * locking to protect this global - it is private to the singleton khelper + * thread and should only ever be modified by that thread. + */ +static const struct task_struct *kmod_thread_locker; + #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -221,6 +228,13 @@ fail: return 0; } +static int call_helper(void *data) +{ + /* Worker thread started blocking khelper thread. */ + kmod_thread_locker = current; + return ____call_usermodehelper(data); +} + static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) @@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work) if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, + else { + pid = kernel_thread(call_helper, sub_info, CLONE_VFORK | SIGCHLD); + /* Worker thread stopped blocking khelper thread. */ + kmod_thread_locker = NULL; + } switch (wait) { case UMH_NO_WAIT: @@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + /* + * Worker thread must not wait for khelper thread at below + * wait_for_completion() if the thread was created with CLONE_VFORK + * flag, for khelper thread is already waiting for the thread at + * wait_for_completion() in do_fork(). + */ + if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { + retval = -EBUSY; + goto out; + } sub_info->complete = &done; sub_info->wait = wait; @@ -577,6 +604,12 @@ unlock: return retval; } +/* + * call_usermodehelper_fns() will not run the caller-provided cleanup function + * if a memory allocation failure is experienced. So the caller might need to + * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform + * the necessaary cleanup within the caller. + */ int call_usermodehelper_fns( char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), diff --git a/kernel/panic.c b/kernel/panic.c index d2a5f4ecc6d..e1b2822fff9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -75,6 +75,14 @@ void panic(const char *fmt, ...) int state = 0; /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since + * there is nothing to prevent an interrupt handler (that runs + * after the panic_lock is acquired) from invoking panic again. + */ + local_irq_disable(); + + /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... diff --git a/kernel/printk.c b/kernel/printk.c index ac4bc9e7946..66a2ea37b57 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -216,6 +216,7 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -228,14 +229,19 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -360,6 +366,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = buf; for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { + ret = -EFAULT; goto out; + } line += iv[i].iov_len; } @@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; @@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - (msg->facility << 3) | msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -646,6 +673,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. + */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif @@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; if (prefix) @@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size) struct log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -997,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len += msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } @@ -1009,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); len -= msg_print_text(msg, prev, true, NULL, 0); + prev = msg->flags; idx = log_next(idx); seq++; } @@ -1022,7 +1061,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; @@ -1349,20 +1389,36 @@ static struct cont { u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(void) +static void cont_flush(enum log_flags flags) { if (cont.flushed) return; if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); - - cont.flushed = true; + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. + */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } } static bool cont_add(int facility, int level, const char *text, size_t len) @@ -1371,7 +1427,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len) return false; if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); return false; } @@ -1380,12 +1437,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); + cont.flags = 0; cont.cons = 0; cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + return true; } @@ -1394,7 +1456,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0) { + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1409,7 +1471,8 @@ static size_t cont_print_text(char *text, size_t size) } if (cont.flushed) { - text[textlen++] = '\n'; + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; /* got everything, release buffer */ cont.len = 0; } @@ -1481,17 +1544,23 @@ asmlinkage int vprintk_emit(int facility, int level, lflags |= LOG_NEWLINE; } - /* strip syslog prefix and extract log level or control flags */ - if (text[0] == '<' && text[1] && text[2] == '>') { - switch (text[1]) { - case '0' ... '7': - if (level == -1) - level = text[1] - '0'; - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - text += 3; - text_len -= 3; + /* strip kernel syslog prefix and extract log level or control flags */ + if (facility == 0) { + int kern_level = printk_get_level(text); + + if (kern_level) { + const char *end_of_header = printk_skip_level(text); + switch (kern_level) { + case '0' ... '7': + if (level == -1) + level = kern_level - '0'; + case 'd': /* KERN_DEFAULT */ + lflags |= LOG_PREFIX; + case 'c': /* KERN_CONT */ + break; + } + text_len -= end_of_header - text; + text = (char *)end_of_header; } } @@ -1507,7 +1576,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(); + cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1525,7 +1594,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(); + cont_flush(LOG_NEWLINE); } if (!stored) @@ -1616,9 +1685,20 @@ asmlinkage int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 #define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; +static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1902,10 +1982,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1923,7 +2027,7 @@ static enum log_flags console_prev; */ void console_unlock(void) { - static char text[LOG_LINE_MAX]; + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1937,19 +2041,7 @@ void console_unlock(void) console_may_schedule = 0; /* flush buffered message fragment immediately to console */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (cont.len && (cont.cons < cont.len || cont.flushed)) { - size_t len; - - len = cont_print_text(text, sizeof(text)); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; @@ -1986,6 +2078,7 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; goto skip; } diff --git a/kernel/resource.c b/kernel/resource.c index dc8b4776444..34d45886ee8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,6 +7,8 @@ * Arbitrary resource management. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/errno.h> #include <linux/ioport.h> @@ -791,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { + int abort = 0; + write_lock(&resource_lock); - __reserve_region_with_split(root, start, end, name); + if (root->start > start || root->end < end) { + pr_err("requested range [0x%llx-0x%llx] not in root %pr\n", + (unsigned long long)start, (unsigned long long)end, + root); + if (start > root->end || end < root->start) + abort = 1; + else { + if (end > root->end) + end = root->end; + if (start < root->start) + start = root->start; + pr_err("fixing request to [0x%llx-0x%llx]\n", + (unsigned long long)start, + (unsigned long long)end); + } + dump_stack(); + } + if (!abort) + __reserve_region_with_split(root, start, end, name); write_unlock(&resource_lock); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1b..fbf1fd098dc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -1910,12 +1910,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + trace_sched_switch(prev, next); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); - trace_sched_switch(prev, next); } /** @@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) */ rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(utime, rtime, total); + else utime = rtime; /* @@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); @@ -4340,9 +4346,7 @@ recheck: */ if (unlikely(policy == p->policy && (!rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return 0; } @@ -6024,6 +6028,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6037,8 +6046,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; @@ -7097,34 +7138,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(true); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7179,6 +7252,7 @@ int in_sched_functions(unsigned long addr) #ifdef CONFIG_CGROUP_SCHED struct task_group root_task_group; +LIST_HEAD(task_groups); #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); @@ -7589,6 +7663,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7603,6 +7678,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index d72586fdf66..23aa789c53e 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -65,8 +65,8 @@ static int convert_prio(int prio) int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask) { - int idx = 0; - int task_pri = convert_prio(p->prio); + int idx = 0; + int task_pri = convert_prio(p->prio); if (task_pri >= MAX_RT_PRIO) return 0; @@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, */ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { - int *currpri = &cp->cpu_to_pri[cpu]; - int oldpri = *currpri; - int do_mb = 0; + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; newpri = convert_prio(newpri); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe..c219bf8d704 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } @@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + unsigned int flags; unsigned int loop; @@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -3373,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + struct rq *rq = cpu_rq(cpu); + unsigned long now = jiffies; + + if (rq->h_load_throttle == now) + return; + + rq->h_load_throttle = now; + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); rcu_read_unlock(); @@ -3642,8 +3664,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, const struct cpumask *cpus, - int *balance, struct sg_lb_stats *sgs) + int local_group, int *balance, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; @@ -3660,7 +3681,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, max_nr_running = 0; min_nr_running = ~0UL; - for_each_cpu_and(i, sched_group_cpus(group), cpus) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); nr_running = rq->nr_running; @@ -3789,8 +3810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -3807,8 +3827,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, - cpus, balance, &sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); if (local_group && !(*balance)) return; @@ -4044,7 +4063,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. * @@ -4054,7 +4072,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, int *balance) { struct sd_lb_stats sds; @@ -4064,7 +4082,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, cpus, balance, &sds); + update_sd_lb_stats(env, balance, &sds); /* * this_cpu is not the appropriate cpu to perform load balancing at @@ -4144,8 +4162,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq *find_busiest_queue(struct lb_env *env, - struct sched_group *group, - const struct cpumask *cpus) + struct sched_group *group) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -4160,7 +4177,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, cpus)) + if (!cpumask_test_cpu(i, env->cpus)) continue; rq = cpu_rq(i); @@ -4227,7 +4244,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4237,16 +4255,19 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, + .cpus = cpus, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, cpus, balance); + group = find_busiest_group(&env, balance); if (*balance == 0) goto out_balanced; @@ -4256,7 +4277,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(&env, group, cpus); + busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -4267,6 +4288,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4279,12 +4301,17 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4296,14 +4323,52 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca0110..944cb68420e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435..f6714d009e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH @@ -374,7 +374,11 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -538,22 +542,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd2..da5eb5bed84 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; return stop; + } return NULL; } @@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; } static void switched_to_stop(struct rq *rq, struct task_struct *p) diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e36..b73e681df09 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + unsigned long old_flags = current->flags; + + /* + * Mask out PF_MEMALLOC s current task context is borrowed for the + * softirq. A softirq handled such as network RX might set PF_MEMALLOC + * again if the socket is related to swap + */ + current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); account_system_vtime(current); @@ -265,6 +273,7 @@ restart: account_system_vtime(current); __local_bh_enable(SOFTIRQ_OFFSET); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); } #ifndef __ARCH_HAS_DO_SOFTIRQ diff --git a/kernel/sys.c b/kernel/sys.c index 2d39a84cd85..241507f23ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } me->pdeath_signal = arg2; - error = 0; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); @@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } set_dumpable(me->mm, arg2); - error = 0; break; case PR_SET_UNALIGN: @@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; - else - error = 0; break; - case PR_SET_NAME: comm[sizeof(me->comm)-1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, @@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); - return 0; + break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; - return 0; + break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; - case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; @@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; - error = 0; break; case PR_MCE_KILL: if (arg4 | arg5) @@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, default: return -EINVAL; } - error = 0; break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) @@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; - error = 0; break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, @@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info) argv_free(info->argv); } -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) +static int __orderly_poweroff(void) { int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + char **argv; static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }; - int ret = -ENOMEM; + int ret; + argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", __func__, poweroff_cmd); - goto out; + return -ENOMEM; } ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, NULL, argv_cleanup, NULL); -out: - if (likely(!ret)) - return 0; - if (ret == -ENOMEM) argv_free(argv); - if (force) { + return ret; +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int ret = __orderly_poweroff(); + + if (ret && force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ emergency_sync(); kernel_power_off(); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab11879aeb..87174ef5916 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,6 +30,7 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemcheck.h> +#include <linux/kmemleak.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = { .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = proc_dostring, + .proc_handler = proc_dostring_coredump, }, { .procname = "core_pipe_limit", @@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, }, { .procname = "swappiness", @@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = { #endif #endif { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_coredump, .extra1 = &zero, .extra2 = &two, }, @@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = { int __init sysctl_init(void) { - register_sysctl_table(sysctl_base_table); + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); return 0; } @@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +static void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMPABLE_SAFE && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING "Unsafe core_pattern used with "\ + "suid_dumpable=2. Pipe handler or fully qualified "\ + "core dump path required.\n"); + } +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a..65bdcf198d4 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, /* VM_PAGEBUF unused */ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0..d320d44903b 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -75,6 +75,7 @@ void task_work_run(void) p = q->next; q->func(q); q = p; + cond_resched(); } } } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e66046456f4..d0a32796550 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); + if (na == NULL) { + rc = -EMSGSIZE; + goto err; + } + stats = nla_data(na); memset(stats, 0, sizeof(*stats)); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e040..46da0537c10 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c97..24174b4d669 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f045cc50832..0c1485e42be 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -65,14 +65,14 @@ struct timekeeper { * used instead. */ struct timespec wall_to_monotonic; - /* time spent in suspend */ - struct timespec total_sleep_time; - /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ - struct timespec raw_time; /* Offset clock monotonic -> clock realtime */ ktime_t offs_real; + /* time spent in suspend */ + struct timespec total_sleep_time; /* Offset clock monotonic -> clock boottime */ ktime_t offs_boot; + /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ + struct timespec raw_time; /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -108,13 +108,39 @@ static struct timespec tk_xtime(struct timekeeper *tk) static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec = ts->tv_sec; - tk->xtime_nsec = ts->tv_nsec << tk->shift; + tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; - tk->xtime_nsec += ts->tv_nsec << tk->shift; + tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +{ + struct timespec tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec_to_ktime(tmp); +} + +static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +{ + /* Verify consistency before modifying */ + WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); + + tk->total_sleep_time = t; + tk->offs_boot = timespec_to_ktime(t); } /** @@ -217,14 +243,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) return nsec + arch_gettimeoffset(); } -static void update_rt_offset(struct timekeeper *tk) -{ - struct timespec tmp, *wtm = &tk->wall_to_monotonic; - - set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); - tk->offs_real = timespec_to_ktime(tmp); -} - /* must hold write on timekeeper.lock */ static void timekeeping_update(struct timekeeper *tk, bool clearntp) { @@ -234,12 +252,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) tk->ntp_error = 0; ntp_clear(); } - update_rt_offset(tk); xt = tk_xtime(tk); update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); } - /** * timekeeping_forward_now - update clock to the current time * @@ -261,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += arch_gettimeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); @@ -277,18 +293,19 @@ static void timekeeping_forward_now(struct timekeeper *tk) */ void getnstimeofday(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs = 0; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } @@ -296,19 +313,18 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { + struct timekeeper *tk = &timekeeper; unsigned int seq; s64 secs, nsecs; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - secs = timekeeper.xtime_sec + - timekeeper.wall_to_monotonic.tv_sec; - nsecs = timekeeping_get_ns(&timekeeper) + - timekeeper.wall_to_monotonic.tv_nsec; + seq = read_seqbegin(&tk->lock); + secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -327,18 +343,19 @@ EXPORT_SYMBOL_GPL(ktime_get); */ void ktime_get_ts(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec); @@ -358,22 +375,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); */ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs_raw, nsecs_real; WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - *ts_raw = timekeeper.raw_time; - ts_real->tv_sec = timekeeper.xtime_sec; + *ts_raw = tk->raw_time; + ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(&timekeeper); - nsecs_real = timekeeping_get_ns(&timekeeper); + nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -406,28 +424,28 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(const struct timespec *tv) { + struct timekeeper *tk = &timekeeper; struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - xt = tk_xtime(&timekeeper); + xt = tk_xtime(tk); ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, ts_delta); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); - tk_set_xtime(&timekeeper, tv); + tk_set_xtime(tk, tv); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -436,7 +454,6 @@ int do_settimeofday(const struct timespec *tv) } EXPORT_SYMBOL(do_settimeofday); - /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset @@ -445,28 +462,37 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } - tk_xtime_add(&timekeeper, ts); - timekeeper.wall_to_monotonic = - timespec_sub(timekeeper.wall_to_monotonic, *ts); + tk_xtime_add(tk, ts); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); - timekeeping_update(&timekeeper, true); +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -477,23 +503,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); */ static int change_clocksource(void *data) { + struct timekeeper *tk = &timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { - old = timekeeper.clock; - tk_setup_internals(&timekeeper, new); + old = tk->clock; + tk_setup_internals(tk, new); if (old->disable) old->disable(old); } - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); return 0; } @@ -507,7 +534,9 @@ static int change_clocksource(void *data) */ void timekeeping_notify(struct clocksource *clock) { - if (timekeeper.clock == clock) + struct timekeeper *tk = &timekeeper; + + if (tk->clock == clock) return; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); @@ -536,35 +565,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real); */ void getrawmonotonic(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; unsigned long seq; s64 nsecs; do { - seq = read_seqbegin(&timekeeper.lock); - nsecs = timekeeping_get_ns_raw(&timekeeper); - *ts = timekeeper.raw_time; + seq = read_seqbegin(&tk->lock); + nsecs = timekeeping_get_ns_raw(tk); + *ts = tk->raw_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; int ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -574,15 +604,16 @@ int timekeeping_valid_for_hres(void) */ u64 timekeeping_max_deferment(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; u64 ret; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - ret = timekeeper.clock->max_idle_ns; + ret = tk->clock->max_idle_ns; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); return ret; } @@ -622,46 +653,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) */ void __init timekeeping_init(void) { + struct timekeeper *tk = &timekeeper; struct clocksource *clock; unsigned long flags; - struct timespec now, boot; + struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } - seqlock_init(&timekeeper.lock); + seqlock_init(&tk->lock); ntp_init(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); - tk_setup_internals(&timekeeper, clock); + tk_setup_internals(tk, clock); - tk_set_xtime(&timekeeper, &now); - timekeeper.raw_time.tv_sec = 0; - timekeeper.raw_time.tv_nsec = 0; + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) - boot = tk_xtime(&timekeeper); - - set_normalized_timespec(&timekeeper.wall_to_monotonic, - -boot.tv_sec, -boot.tv_nsec); - update_rt_offset(&timekeeper); - timekeeper.total_sleep_time.tv_sec = 0; - timekeeper.total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&timekeeper.lock, flags); + boot = tk_xtime(tk); + + set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(tk, tmp); + + tmp.tv_sec = 0; + tmp.tv_nsec = 0; + tk_set_sleep_time(tk, tmp); + + write_sequnlock_irqrestore(&tk->lock, flags); } /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; -static void update_sleep_time(struct timespec t) -{ - timekeeper.total_sleep_time = t; - timekeeper.offs_boot = timespec_to_ktime(t); -} - /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -677,13 +718,11 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, "sleep delta value!\n"); return; } - tk_xtime_add(tk, delta); - tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); - update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); + tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); + tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); } - /** * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec delta value @@ -696,6 +735,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, */ void timekeeping_inject_sleeptime(struct timespec *delta) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -704,21 +744,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); - timekeeping_forward_now(&timekeeper); + timekeeping_forward_now(tk); - __timekeeping_inject_sleeptime(&timekeeper, delta); + __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(&timekeeper, true); + timekeeping_update(tk, true); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); /* signal hrtimers about time change */ clock_was_set(); } - /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @@ -728,6 +767,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec ts; @@ -735,18 +775,18 @@ static void timekeeping_resume(void) clocksource_resume(); - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(&timekeeper, &ts); + __timekeeping_inject_sleeptime(tk, &ts); } /* re-base the last cycle value */ - timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); - timekeeper.ntp_error = 0; + tk->clock->cycle_last = tk->clock->read(tk->clock); + tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(&timekeeper, false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); + timekeeping_update(tk, false); + write_sequnlock_irqrestore(&tk->lock, flags); touch_softlockup_watchdog(); @@ -758,14 +798,15 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { + struct timekeeper *tk = &timekeeper; unsigned long flags; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeping_forward_now(&timekeeper); + write_seqlock_irqsave(&tk->lock, flags); + timekeeping_forward_now(tk); timekeeping_suspended = 1; /* @@ -774,7 +815,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); + delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -787,7 +828,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -898,27 +939,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * the error. This causes the likely below to be unlikely. * * The proper fix is to avoid rounding up by using - * the high precision timekeeper.xtime_nsec instead of + * the high precision tk->xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. */ if (likely(error <= interval)) adj = 1; else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else if (error < -interval) { - /* See comment above, this is just switched for the negative */ - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = timekeeping_bigadjust(tk, error, &interval, - &offset); - } else - return; + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } else { + if (error < -interval) { + /* See comment above, this is just switched for the negative */ + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else { + adj = timekeeping_bigadjust(tk, error, &interval, &offset); + } + } else { + goto out_adjust; + } + } if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { @@ -981,6 +1024,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +out_adjust: /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource @@ -1003,7 +1047,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } - /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * @@ -1024,15 +1067,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); - tk->xtime_sec += leap; - tk->wall_to_monotonic.tv_sec -= leap; - if (leap) - clock_was_set_delayed(); + if (unlikely(leap)) { + struct timespec ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec_sub(tk->wall_to_monotonic, ts)); + clock_was_set_delayed(); + } } } - /** * logarithmic_accumulation - shifted accumulation of cycles * @@ -1076,7 +1125,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, return offset; } - /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -1084,25 +1132,30 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, static void update_wall_time(void) { struct clocksource *clock; + struct timekeeper *tk = &timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; s64 remainder; - write_seqlock_irqsave(&timekeeper.lock, flags); + write_seqlock_irqsave(&tk->lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = timekeeper.clock; + clock = tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = timekeeper.cycle_interval; + offset = tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < tk->cycle_interval) + goto out; + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, @@ -1111,19 +1164,19 @@ static void update_wall_time(void) * chunk in one go, and then try to consume the next smaller * doubled multiple. */ - shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); - while (offset >= timekeeper.cycle_interval) { - offset = logarithmic_accumulation(&timekeeper, offset, shift); - if(offset < timekeeper.cycle_interval<<shift) + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift); + if (offset < tk->cycle_interval<<shift) shift--; } /* correct the clock when NTP error is too big */ - timekeeping_adjust(&timekeeper, offset); + timekeeping_adjust(tk, offset); /* @@ -1135,21 +1188,21 @@ static void update_wall_time(void) * the vsyscall implementations are converted to use xtime_nsec * (shifted nanoseconds), this can be killed. */ - remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); - timekeeper.xtime_nsec -= remainder; - timekeeper.xtime_nsec += 1 << timekeeper.shift; - timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); + tk->xtime_nsec -= remainder; + tk->xtime_nsec += 1ULL << tk->shift; + tk->ntp_error += remainder << tk->ntp_error_shift; /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - accumulate_nsecs_to_secs(&timekeeper); + accumulate_nsecs_to_secs(tk); - timekeeping_update(&timekeeper, false); + timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&timekeeper.lock, flags); + write_sequnlock_irqrestore(&tk->lock, flags); } @@ -1166,18 +1219,18 @@ out: */ void getboottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec boottime = { - .tv_sec = timekeeper.wall_to_monotonic.tv_sec + - timekeeper.total_sleep_time.tv_sec, - .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + - timekeeper.total_sleep_time.tv_nsec + .tv_sec = tk->wall_to_monotonic.tv_sec + + tk->total_sleep_time.tv_sec, + .tv_nsec = tk->wall_to_monotonic.tv_nsec + + tk->total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } EXPORT_SYMBOL_GPL(getboottime); - /** * get_monotonic_boottime - Returns monotonic time since boot * @ts: pointer to the timespec to be set @@ -1189,19 +1242,20 @@ EXPORT_SYMBOL_GPL(getboottime); */ void get_monotonic_boottime(struct timespec *ts) { + struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; unsigned int seq; WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&timekeeper.lock); - ts->tv_sec = timekeeper.xtime_sec; - ts->tv_nsec = timekeeping_get_ns(&timekeeper); - tomono = timekeeper.wall_to_monotonic; - sleep = timekeeper.total_sleep_time; + seq = read_seqbegin(&tk->lock); + ts->tv_sec = tk->xtime_sec; + ts->tv_nsec = timekeeping_get_ns(tk); + tomono = tk->wall_to_monotonic; + sleep = tk->total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); @@ -1231,31 +1285,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, timekeeper.total_sleep_time); + struct timekeeper *tk = &timekeeper; + + *ts = timespec_add(*ts, tk->total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return timekeeper.xtime_sec; + struct timekeeper *tk = &timekeeper; + + return tk->xtime_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return tk_xtime(&timekeeper); + struct timekeeper *tk = &timekeeper; + + return tk_xtime(tk); } struct timespec current_kernel_time(void) { + struct timekeeper *tk = &timekeeper; struct timespec now; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + } while (read_seqretry(&tk->lock, seq)); return now; } @@ -1263,15 +1324,16 @@ EXPORT_SYMBOL(current_kernel_time); struct timespec get_monotonic_coarse(void) { + struct timekeeper *tk = &timekeeper; struct timespec now, mono; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - now = tk_xtime(&timekeeper); - mono = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1300,14 +1362,15 @@ void do_timer(unsigned long ticks) void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, struct timespec *wtom, struct timespec *sleep) { + struct timekeeper *tk = &timekeeper; unsigned long seq; do { - seq = read_seqbegin(&timekeeper.lock); - *xtim = tk_xtime(&timekeeper); - *wtom = timekeeper.wall_to_monotonic; - *sleep = timekeeper.total_sleep_time; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + *xtim = tk_xtime(tk); + *wtom = tk->wall_to_monotonic; + *sleep = tk->total_sleep_time; + } while (read_seqretry(&tk->lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1321,19 +1384,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, */ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) { + struct timekeeper *tk = &timekeeper; ktime_t now; unsigned int seq; u64 secs, nsecs; do { - seq = read_seqbegin(&timekeeper.lock); + seq = read_seqbegin(&tk->lock); - secs = timekeeper.xtime_sec; - nsecs = timekeeping_get_ns(&timekeeper); + secs = tk->xtime_sec; + nsecs = timekeeping_get_ns(tk); - *offs_real = timekeeper.offs_real; - *offs_boot = timekeeper.offs_boot; - } while (read_seqretry(&timekeeper.lock, seq)); + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1346,19 +1410,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) */ ktime_t ktime_get_monotonic_offset(void) { + struct timekeeper *tk = &timekeeper; unsigned long seq; struct timespec wtom; do { - seq = read_seqbegin(&timekeeper.lock); - wtom = timekeeper.wall_to_monotonic; - } while (read_seqretry(&timekeeper.lock, seq)); + seq = read_seqbegin(&tk->lock); + wtom = tk->wall_to_monotonic; + } while (read_seqretry(&tk->lock, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eb..8c5e7b908c6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - /** * sys_getpid - return the thread group id of the current process * @@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a120f98c411..5c38c81496c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f..8a6d2ee2086 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db0..a426f410c06 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, +#endif { } /* Always set a last empty entry */ }; @@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699f..1a2117043bb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc7336909..6b245f64c8d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; @@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549..03003cd7dd9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); |