From 96e65306b81351b656835c15931d1d237b252f27 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 2 Sep 2012 00:28:19 +0800 Subject: workqueue: UNBOUND -> REBIND morphing in rebind_workers() should be atomic The compiler may compile the following code into TWO write/modify instructions. worker->flags &= ~WORKER_UNBOUND; worker->flags |= WORKER_REBIND; so the other CPU may temporarily see worker->flags which doesn't have either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup prematurely. Fix it by using single explicit assignment via ACCESS_ONCE(). Because idle workers have another WORKER_NOT_RUNNING flag, this bug doesn't exist for them; however, update it to use the same pattern for consistency. tj: Applied the change to idle workers too and updated comments and patch description a bit. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a1..c462cd60c37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1396,12 +1396,15 @@ retry: /* set REBIND and kick idle ones, we'll wait for these later */ for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) { + unsigned long worker_flags = worker->flags; + if (worker->flags & WORKER_REBIND) continue; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; idle_rebind.cnt++; worker->idle_rebind = &idle_rebind; @@ -1434,10 +1437,12 @@ retry: /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) -- cgit v1.2.3-70-g09d2 From 90beca5de591e12482a812f23a7f10690962ed4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:12:33 -0700 Subject: workqueue: move WORKER_REBIND clearing in rebind_workers() to the end of the function This doesn't make any functional difference and is purely to help the next patch to be simpler. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c462cd60c37..d79a18d0c42 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1422,19 +1422,7 @@ retry: goto retry; } - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags &= ~WORKER_REBIND; - - wake_up_all(&gcwq->rebind_hold); - - /* rebind busy workers */ + /* all idle workers are rebound, rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; unsigned long worker_flags = worker->flags; @@ -1454,6 +1442,18 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); } static struct worker *alloc_worker(void) -- cgit v1.2.3-70-g09d2 From ec58815ab0409a921a7c9744eb4ca44866b14d71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:16:32 -0700 Subject: workqueue: fix possible deadlock in idle worker rebinding Currently, rebind_workers() and idle_worker_rebind() are two-way interlocked. rebind_workers() waits for idle workers to finish rebinding and rebound idle workers wait for rebind_workers() to finish rebinding busy workers before proceeding. Unfortunately, this isn't enough. The second wait from idle workers is implemented as follows. wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); rebind_workers() clears WORKER_REBIND, wakes up the idle workers and then returns. If CPU hotplug cycle happens again before one of the idle workers finishes the above wait_event(), rebind_workers() will repeat the first part of the handshake - set WORKER_REBIND again and wait for the idle worker to finish rebinding - and this leads to deadlock because the idle worker would be waiting for WORKER_REBIND to clear. This is fixed by adding another interlocking step at the end - rebind_workers() now waits for all the idle workers to finish the above WORKER_REBIND wait before returning. This ensures that all rebinding steps are complete on all idle workers before the next hotplug cycle can happen. This problem was diagnosed by Lai Jiangshan who also posted a patch to fix the issue, upon which this patch is based. This is the minimal fix and further patches are scheduled for the next merge window to simplify the CPU hotplug path. Signed-off-by: Tejun Heo Original-patch-by: Lai Jiangshan LKML-Reference: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com> --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d79a18d0c42..dc7b8458e27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1326,6 +1326,15 @@ static void idle_worker_rebind(struct worker *worker) /* we did our part, wait for rebind_workers() to finish up */ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + + /* + * rebind_workers() shouldn't finish until all workers passed the + * above WORKER_REBIND wait. Tell it when done. + */ + spin_lock_irq(&worker->pool->gcwq->lock); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); } /* @@ -1448,12 +1457,28 @@ retry: * be cleared inside idle_worker_rebind(). Clear and release. * Clearing %WORKER_REBIND from this foreign context is safe * because these workers are still guaranteed to be idle. + * + * We need to make sure all idle workers passed WORKER_REBIND wait + * in idle_worker_rebind() before returning; otherwise, workers can + * get stuck at the wait if hotplug cycle repeats. */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { worker->flags &= ~WORKER_REBIND; + idle_rebind.cnt++; + } + } wake_up_all(&gcwq->rebind_hold); + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + } } static struct worker *alloc_worker(void) -- cgit v1.2.3-70-g09d2 From 552a37e9360a293cd20e7f8ff1fb326a244c5f1e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:33 -0700 Subject: workqueue: restore POOL_MANAGING_WORKERS This patch restores POOL_MANAGING_WORKERS which was replaced by pool->manager_mutex by 6037315269 "workqueue: use mutex for global_cwq manager exclusion". There's a subtle idle worker depletion bug across CPU hotplug events and we need to distinguish an actual manager and CPU hotplug preventing management. POOL_MANAGING_WORKERS will be used for the former and manager_mutex the later. This patch just lays POOL_MANAGING_WORKERS on top of the existing manager_mutex and doesn't introduce any synchronization changes. The next patch will update it. Note that this patch fixes a non-critical anomaly where too_many_workers() may return %true spuriously while CPU hotplug is in progress. While the issue could schedule idle timer spuriously, it didn't trigger any actual misbehavior. tj: Rewrote patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc7b8458e27..383548ed0b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_mutex); + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1827,6 +1828,7 @@ static bool manage_workers(struct worker *worker) if (!mutex_trylock(&pool->manager_mutex)) return ret; + pool->flags |= POOL_MANAGING_WORKERS; pool->flags &= ~POOL_MANAGE_WORKERS; /* @@ -1836,6 +1838,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); + pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->manager_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From ee378aa49b594da9bda6a2c768cc5b2ad585f911 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:44 -0700 Subject: workqueue: fix possible idle worker depletion across CPU hotplug To simplify both normal and CPU hotplug paths, worker management is prevented while CPU hoplug is in progress. This is achieved by CPU hotplug holding the same exclusion mechanism used by workers to ensure there's only one manager per pool. If someone else seems to be performing the manager role, workers proceed to execute work items. CPU hotplug using the same mechanism can lead to idle worker depletion because all workers could proceed to execute work items while CPU hotplug is in progress and CPU hotplug itself wouldn't actually perform the worker management duty - it doesn't guarantee that there's an idle worker left when it releases management. This idle worker depletion, under extreme circumstances, can break forward-progress guarantee and thus lead to deadlock. This patch fixes the bug by using separate mechanisms for manager exclusion among workers and hotplug exclusion. For manager exclusion, POOL_MANAGING_WORKERS which was restored by the previous patch is used. pool->manager_mutex is now only used for exclusion between the elected manager and CPU hotplug. The elected manager won't proceed without holding pool->manager_mutex. This ensures that the worker which won the manager position can't skip managing while CPU hotplug is in progress. It will block on manager_mutex and perform management after CPU hotplug is complete. Note that hotplug may happen while waiting for manager_mutex. A manager isn't either on idle or busy list and thus the hoplug code can't unbind/rebind it. Make the manager handle its own un/rebinding. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 383548ed0b5..1e1373bcb3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1825,10 +1825,45 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (!mutex_trylock(&pool->manager_mutex)) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; pool->flags |= POOL_MANAGING_WORKERS; + + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->manager_mutex to synchronize + * manager against CPU hotplug. + * + * manager_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->manager_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for manager_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via manager_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + pool->flags &= ~POOL_MANAGE_WORKERS; /* -- cgit v1.2.3-70-g09d2 From 960bd11bf2daf669d0d910428fd9ef5a15c3d7cb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 17 Sep 2012 15:42:31 -0700 Subject: workqueue: always clear WORKER_REBIND in busy_worker_rebind_fn() busy_worker_rebind_fn() didn't clear WORKER_REBIND if rebinding failed (CPU is down again). This used to be okay because the flag wasn't used for anything else. However, after 25511a477 "workqueue: reimplement CPU online rebinding to handle idle workers", WORKER_REBIND is also used to command idle workers to rebind. If not cleared, the worker may confuse the next CPU_UP cycle by having REBIND spuriously set or oops / get stuck by prematurely calling idle_worker_rebind(). WARNING: at /work/os/wq/kernel/workqueue.c:1323 worker_thread+0x4cd/0x5 00() Hardware name: Bochs Modules linked in: test_wq(O-) Pid: 33, comm: kworker/1:1 Tainted: G O 3.6.0-rc1-work+ #3 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] worker_thread+0x4cd/0x500 [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 ---[ end trace e977cf20f4661968 ]--- BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] worker_thread+0x360/0x500 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: test_wq(O-) CPU 0 Pid: 33, comm: kworker/1:1 Tainted: G W O 3.6.0-rc1-work+ #3 Bochs Bochs RIP: 0010:[] [] worker_thread+0x360/0x500 RSP: 0018:ffff88001e1c9de0 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffff88001e633e00 RCX: 0000000000004140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000009 RBP: ffff88001e1c9ea0 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000002 R11: 0000000000000000 R12: ffff88001fc8d580 R13: ffff88001fc8d590 R14: ffff88001e633e20 R15: ffff88001e1c6900 FS: 0000000000000000(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000000130e8000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:1 (pid: 33, threadinfo ffff88001e1c8000, task ffff88001e1c6900) Stack: ffff880000000000 ffff88001e1c9e40 0000000000000001 ffff88001e1c8010 ffff88001e519c78 ffff88001e1c9e58 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001fc8d340 ffff88001fc8d340 Call Trace: [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 Code: b1 00 f6 43 48 02 0f 85 91 01 00 00 48 8b 43 38 48 89 df 48 8b 00 48 89 45 90 e8 ac f0 ff ff 3c 01 0f 85 60 01 00 00 48 8b 53 50 <8b> 02 83 e8 01 85 c0 89 02 0f 84 3b 01 00 00 48 8b 43 38 48 8b RIP [] worker_thread+0x360/0x500 RSP CR2: 0000000000000000 There was no reason to keep WORKER_REBIND on failure in the first place - WORKER_UNBOUND is guaranteed to be set in such cases preventing incorrectly activating concurrency management. Always clear WORKER_REBIND. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e1373bcb3e..b80065a2450 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1349,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_maybe_bind_and_lock(worker); + + /* + * %WORKER_REBIND must be cleared even if the above binding failed; + * otherwise, we may confuse the next CPU_UP cycle or oops / get + * stuck by calling idle_worker_rebind() prematurely. If CPU went + * down again inbetween, %WORKER_UNBOUND would be set, so clearing + * %WORKER_REBIND is always safe. + */ + worker_clr_flags(worker, WORKER_REBIND); spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.3-70-g09d2