From 20bdedafd2f63e0ba70991127f9b5c0826ebdb32 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 30 Jun 2023 21:28:53 +0900 Subject: [PATCH 01/31] workqueue: Warn attempt to flush system-wide workqueues. Based on commit c4f135d643823a86 ("workqueue: Wrap flush_workqueue() using a macro"), all in-tree users stopped flushing system-wide workqueues. Therefore, start emitting runtime message so that all out-of-tree users will understand that they need to update their code. Signed-off-by: Tetsuo Handa Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 44 +++------------------------------------ kernel/workqueue.c | 11 +++++----- 2 files changed, 8 insertions(+), 47 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 683efe29fa69..1a4223cbdb5f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -569,6 +569,7 @@ static inline bool schedule_work(struct work_struct *work) /* * Detect attempt to flush system-wide workqueues at compile time when possible. + * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. @@ -576,52 +577,13 @@ static inline bool schedule_work(struct work_struct *work) extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); -/** - * flush_scheduled_work - ensure that any scheduled work has run to completion. - * - * Forces execution of the kernel-global workqueue and blocks until its - * completion. - * - * It's very easy to get into trouble if you don't take great care. - * Either of the following situations will lead to deadlock: - * - * One of the work items currently on the workqueue needs to acquire - * a lock held by your code or its caller. - * - * Your code is running in the context of a work routine. - * - * They will be detected by lockdep when they occur, but the first might not - * occur very often. It depends on what work items are on the workqueue and - * what locks they need, which you have no control over. - * - * In most situations flushing the entire workqueue is overkill; you merely - * need to know that a particular work item isn't queued and isn't running. - * In such cases you should use cancel_delayed_work_sync() or - * cancel_work_sync() instead. - * - * Please stop calling this function! A conversion to stop flushing system-wide - * workqueues is in progress. This function will be removed after all in-tree - * users stopped calling this function. - */ -/* - * The background of commit 771c035372a036f8 ("deprecate the - * '__deprecated' attribute warnings entirely and for good") is that, - * since Linus builds all modules between every single pull he does, - * the standard kernel build needs to be _clean_ in order to be able to - * notice when new problems happen. Therefore, don't emit warning while - * there are in-tree users. - */ +/* Please stop using this function, for this function will be removed in near future. */ #define flush_scheduled_work() \ ({ \ - if (0) \ - __warn_flushing_systemwide_wq(); \ + __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_wq); \ }) -/* - * Although there is no longer in-tree caller, for now just emit warning - * in order to give out-of-tree callers time to update. - */ #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..f8891552fdd6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6571,10 +6571,9 @@ void __init workqueue_init(void) wq_watchdog_init(); } -/* - * Despite the naming, this is a no-op function which is here only for avoiding - * link error. Since compile-time warning may fail to catch, we will need to - * emit run-time warning from __flush_workqueue(). - */ -void __warn_flushing_systemwide_wq(void) { } +void __warn_flushing_systemwide_wq(void) +{ + pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); + dump_stack(); +} EXPORT_SYMBOL(__warn_flushing_systemwide_wq); From ace3c5499e61ef7c0433a7a297227a9bdde54a55 Mon Sep 17 00:00:00 2001 From: tiozhang Date: Thu, 29 Jun 2023 11:50:50 +0800 Subject: [PATCH 02/31] workqueue: add cmdline parameter `workqueue.unbound_cpus` to further constrain wq_unbound_cpumask at boot time Motivation of doing this is to better improve boot times for devices when we want to prevent our workqueue works from running on some specific CPUs, e,g, some CPUs are busy with interrupts. Signed-off-by: tiozhang Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/workqueue.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..d1edee0fd5ec 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6964,6 +6964,13 @@ disables both lockup detectors. Default is 10 seconds. + workqueue.unbound_cpus= + [KNL,SMP] Specify to constrain one or some CPUs + to use in unbound workqueues. + Format: + By default, all online CPUs are available for + unbound workqueues. + workqueue.watchdog_thresh= If CONFIG_WQ_WATCHDOG is configured, workqueue can warn stall conditions and dump internal state to diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f8891552fdd6..83f8993af57c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -368,6 +368,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -6455,6 +6458,9 @@ void __init workqueue_init_early(void) cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (!cpumask_empty(&wq_cmdline_cpumask)) + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); /* initialize CPU pools */ @@ -6577,3 +6583,14 @@ void __warn_flushing_systemwide_wq(void) dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); + +static int __init workqueue_unbound_cpus_setup(char *str) +{ + if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { + cpumask_clear(&wq_cmdline_cpumask); + pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); + } + + return 1; +} +__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); From 9680540c0c56a1f75a2d6aab31bf38aa429aa9d9 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 4 Aug 2023 11:22:15 +0800 Subject: [PATCH 03/31] workqueue: use LIST_HEAD to initialize cull_list Use LIST_HEAD() to initialize cull_list instead of open-coding it. Signed-off-by: Yang Yingliang Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 83f8993af57c..a90f1e642548 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2304,9 +2304,8 @@ static void idle_worker_timeout(struct timer_list *t) static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); - struct list_head cull_list; + LIST_HEAD(cull_list); - INIT_LIST_HEAD(&cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong worker_detach_from_pool() in its self-destruct @@ -3872,10 +3871,8 @@ static void rcu_free_pool(struct rcu_head *rcu) static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); - struct list_head cull_list; struct worker *worker; - - INIT_LIST_HEAD(&cull_list); + LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); From bc8b50c2dfac946c1beed782c1823e52cf55a352 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: [PATCH 04/31] workqueue: Drop the special locking rule for worker->flags and worker_pool->flags worker->flags used to be accessed from scheduler hooks without grabbing pool->lock for concurrency management. This is no longer true since 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock"). Also, it's unclear why worker_pool->flags was using the "X" rule. All relevant users are accessing it under the pool lock. Let's drop the special "X" rule and use the "L" rule for these flag fields instead. While at it, replace the CONTEXT comment with lockdep_assert_held(). This allows worker_set/clr_flags() to be used from context which isn't the worker itself. This will be used later to implement assinging work items to workers before waking them up so that workqueue can have better control over which worker executes which work item on which CPU. The only actual changes are sanity checks. There shouldn't be any visible behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 17 +++-------------- kernel/workqueue_internal.h | 2 +- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ae975a7c9f69..598009ad97d1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,11 +122,6 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. - * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -160,7 +155,7 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ @@ -910,15 +905,12 @@ static void wake_up_worker(struct worker_pool *pool) * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && @@ -935,16 +927,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 6b1d66e28269..f6275944ada7 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -48,7 +48,7 @@ struct worker { /* A: runs through worker->node */ unsigned long last_active; /* K: last active timestamp */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ int id; /* I: worker id */ /* From c0ab017d43f4c4147f7ecf3ca3cb872a416e17c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: [PATCH 05/31] workqueue: Cleanups around process_scheduled_works() * Drop the trivial optimization in worker_thread() where it bypasses calling process_scheduled_works() if the first work item isn't linked. This is a mostly pointless micro optimization and gets in the way of improving the work processing path. * Consolidate pool->watchdog_ts updates in the two callers into process_scheduled_works(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 598009ad97d1..20722de9937a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2652,9 +2652,15 @@ __acquires(&pool->lock) */ static void process_scheduled_works(struct worker *worker) { - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); + struct work_struct *work; + bool first = true; + + while ((work = list_first_entry_or_null(&worker->scheduled, + struct work_struct, entry))) { + if (first) { + worker->pool->watchdog_ts = jiffies; + first = false; + } process_one_work(worker, work); } } @@ -2735,17 +2741,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - pool->watchdog_ts = jiffies; - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); - process_scheduled_works(worker); - } + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2820,7 +2817,6 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; - bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2838,12 +2834,9 @@ repeat: WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { - if (first) - pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); pwq->stats[PWQ_STAT_RESCUED]++; } - first = false; } if (!list_empty(scheduled)) { From fe089f87cccb066e8ad20f49ddf05e95adc1fa8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: [PATCH 06/31] workqueue: Not all work insertion needs to wake up a worker insert_work() always tried to wake up a worker; however, the only time it needs to try to wake up a worker is when a new active work item is queued. When a work item goes on the inactive list or queueing a flush work item, there's no reason to try to wake up a worker. This patch moves the worker wakeup logic out of insert_work() and places it in the active new work item queueing path in __queue_work(). While at it: * __queue_work() is dereferencing pwq->pool repeatedly. Add local variable pool. * Every caller of insert_work() calls debug_work_activate(). Consolidate the invocations into insert_work(). * In __queue_work() pool->watchdog_ts update is relocated slightly. This is to better accommodate future changes. This makes wakeups more precise and will help the planned change to assign work items to workers before waking them up. No behavior changes intended. v2: WARN_ON_ONCE(pool != last_pool) added in __queue_work() to clarify as suggested by Lai. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 20722de9937a..1332bd545b92 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1542,7 +1542,7 @@ fail: static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = pwq->pool; + debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack_noalloc(work); @@ -1551,9 +1551,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); - - if (__need_more_worker(pool)) - wake_up_worker(pool); } /* @@ -1607,8 +1604,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; - struct worker_pool *last_pool; - struct list_head *worklist; + struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1642,13 +1638,15 @@ retry: pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); } + pool = pwq->pool; + /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. */ last_pool = get_work_pool(work); - if (last_pool && last_pool != pwq->pool) { + if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -1657,13 +1655,15 @@ retry: if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; + pool = pwq->pool; + WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } } else { - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } /* @@ -1676,7 +1676,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } @@ -1695,21 +1695,22 @@ retry: work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { + if (list_empty(&pool->worklist)) + pool->watchdog_ts = jiffies; + trace_workqueue_activate_work(work); pwq->nr_active++; - worklist = &pwq->pool->worklist; - if (list_empty(worklist)) - pwq->pool->watchdog_ts = jiffies; + insert_work(pwq, work, &pool->worklist, work_flags); + + if (__need_more_worker(pool)) + wake_up_worker(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; - worklist = &pwq->inactive_works; + insert_work(pwq, work, &pwq->inactive_works, work_flags); } - debug_work_activate(work); - insert_work(pwq, work, worklist, work_flags); - out: - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); } @@ -3012,7 +3013,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); - debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } From ee1ceef72754427e5167743108c52f826fa4ca5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 07/31] workqueue: Rename wq->cpu_pwqs to wq->cpu_pwq wq->cpu_pwqs is a percpu variable carraying one pointer to a pool_workqueue. The field name being plural is unusual and confusing. Rename it to singular. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1332bd545b92..ea94ad63386e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -321,7 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; @@ -1635,7 +1635,7 @@ retry: } else { if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + pwq = per_cpu_ptr(wq->cpu_pwq, cpu); } pool = pwq->pool; @@ -3826,7 +3826,7 @@ static void rcu_free_wq(struct rcu_head *rcu) wq_free_lockdep(wq); if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); + free_percpu(wq->cpu_pwq); else free_workqueue_attrs(wq->unbound_attrs); @@ -4518,13 +4518,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); - if (!wq->cpu_pwqs) + wq->cpu_pwq = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwq) return -ENOMEM; for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = - per_cpu_ptr(wq->cpu_pwqs, cpu); + per_cpu_ptr(wq->cpu_pwq, cpu); struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); @@ -4905,7 +4905,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) cpu = smp_processor_id(); if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + pwq = per_cpu_ptr(wq->cpu_pwq, cpu); else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); From 797e8345cbb0d2913300ee9838eb74cce19485cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 08/31] workqueue: Relocate worker and work management functions Collect first_idle_worker(), worker_enter/leave_idle(), find_worker_executing_work(), move_linked_works() and wake_up_worker() into one place. These functions will later be used to implement higher level worker management logic. No functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 340 ++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 172 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ea94ad63386e..8f5338e7331c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -869,36 +869,6 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Called with pool->lock held. */ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self @@ -947,6 +917,174 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +/* Return the first idle worker. Called with pool->lock held. */ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; + + /* can't use worker_set_flags(), also called from create_worker() */ + worker->flags |= WORKER_IDLE; + pool->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &pool->idle_list); + + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); + + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; + worker_clr_flags(worker, WORKER_IDLE); + pool->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + * + * Return: + * Pointer to worker which is executing @work if found, %NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct worker_pool *pool, + struct work_struct *work) +{ + struct worker *worker; + + hash_for_each_possible(pool->busy_hash, worker, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) + return worker; + + return NULL; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +/** + * wake_up_worker - wake up an idle worker + * @pool: worker pool to wake worker from + * + * Wake up the first idle worker of @pool. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void wake_up_worker(struct worker_pool *pool) +{ + struct worker *worker = first_idle_worker(pool); + + if (likely(worker)) + wake_up_process(worker->task); +} + #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* @@ -1202,94 +1340,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) return worker->last_func; } -/** - * find_worker_executing_work - find worker which is executing a work - * @pool: pool of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @pool by searching - * @pool->busy_hash which is keyed by the address of @work. For a worker - * to match, its current execution should match the address of @work and - * its work function. This is to avoid unwanted dependency between - * unrelated work executions through a work item being recycled while still - * being executed. - * - * This is a bit tricky. A work item may be freed once its execution - * starts and nothing prevents the freed area from being recycled for - * another work item. If the same work item address ends up being reused - * before the original execution finishes, workqueue will identify the - * recycled work item as currently executing and make it wait until the - * current execution finishes, introducing an unwanted dependency. - * - * This function checks the work item address and work function to avoid - * false positives. Note that this isn't complete as one may construct a - * work function which can introduce dependency onto itself through a - * recycled work item. Well, if somebody wants to shoot oneself in the - * foot that badly, there's only so much we can do, and if such deadlock - * actually occurs, it should be easy to locate the culprit work function. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - * - * Return: - * Pointer to worker which is executing @work if found, %NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct worker_pool *pool, - struct work_struct *work) -{ - struct worker *worker; - - hash_for_each_possible(pool->busy_hash, worker, hentry, - (unsigned long)work) - if (worker->current_work == work && - worker->current_func == work->func) - return worker; - - return NULL; -} - -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out parameter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1974,60 +2024,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) } EXPORT_SYMBOL(queue_rcu_work); -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || - WARN_ON_ONCE(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev))) - return; - - /* can't use worker_set_flags(), also called from create_worker() */ - worker->flags |= WORKER_IDLE; - pool->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &pool->idle_list); - - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - - /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_leave_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) - return; - worker_clr_flags(worker, WORKER_IDLE); - pool->nr_idle--; - list_del_init(&worker->entry); -} - static struct worker *alloc_worker(int node) { struct worker *worker; From fcecfa8f271acdf130acbb30842e7848a138af0f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 09/31] workqueue: Remove module param disable_numa and sysfs knobs pool_ids and numa Unbound workqueue CPU affinity is going to receive an overhaul and the NUMA specific knobs won't make sense anymore. Remove them. Also, the pool_ids knob was used for debugging and not really meaningful given that there is no visibility into the pools associated with those IDs. Remove it too. A future patch will improve overall visibility. Signed-off-by: Tejun Heo --- .../admin-guide/kernel-parameters.txt | 9 --- kernel/workqueue.c | 73 ------------------- 2 files changed, 82 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d1edee0fd5ec..2b89cbc39713 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6992,15 +6992,6 @@ threshold repeatedly. They are likely good candidates for using WQ_UNBOUND workqueues instead. - workqueue.disable_numa - By default, all work items queued to unbound - workqueues are affine to the NUMA nodes they're - issued on, which results in better behavior in - general. If NUMA affinity needs to be disabled for - whatever reason, this option can be used. Note - that this also can be controlled per-workqueue for - workqueues visible under /sys/bus/workqueue/. - workqueue.power_efficient Per-cpu workqueues are generally preferred because they show better performance thanks to cache diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8f5338e7331c..4347f493eca1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -340,9 +340,6 @@ static cpumask_var_t *wq_numa_possible_cpumask; static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); - /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); @@ -5794,10 +5791,8 @@ out_unlock: * * Unbound workqueues have the following extra attributes. * - * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers - * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; @@ -5850,28 +5845,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - cpus_read_lock(); - rcu_read_lock(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock(); - cpus_read_unlock(); - - return written; -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -5962,50 +5935,9 @@ out_unlock: return ret ?: count; } -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret = -ENOMEM; - - apply_wqattrs_lock(); - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - goto out_unlock; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs_locked(wq, attrs); - } - -out_unlock: - apply_wqattrs_unlock(); - free_workqueue_attrs(attrs); - return ret ?: count; -} - static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -6379,11 +6311,6 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; - if (wq_disable_numa) { - pr_info("workqueue: NUMA affinity support disabled\n"); - return; - } - for_each_possible_cpu(cpu) { if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); From 967b494e2fd143a9c1a3201422aceadb5fa9fbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 10/31] workqueue: Use a kthread_worker to release pool_workqueues pool_workqueue release path is currently bounced to system_wq; however, this is a bit tricky because this bouncing occurs while holding a pool lock and thus has risk of causing a A-A deadlock. This is currently addressed by the fact that only unbound workqueues use this bouncing path and system_wq is a per-cpu workqueue. While this works, it's brittle and requires a work-around like setting the lockdep subclass for the lock of unbound pools. Besides, future changes will use the bouncing path for per-cpu workqueues too making the current approach unusable. Let's just use a dedicated kthread_worker to untangle the dependency. This is just one more kthread for all workqueues and makes the pwq release logic simpler and more robust. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4347f493eca1..01bf22c5d515 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -257,12 +257,12 @@ struct pool_workqueue { u64 stats[PWQ_NR_STATS]; /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue itself + * is also RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. */ - struct work_struct unbound_release_work; + struct kthread_work unbound_release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -395,6 +395,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. + */ +static struct kthread_worker *pwq_release_worker; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -1366,14 +1373,10 @@ static void put_pwq(struct pool_workqueue *pwq) if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) return; /* - * @pwq can't be released under pool->lock, bounce to - * pwq_unbound_release_workfn(). This never recurses on the same - * pool->lock as this path is taken only for unbound workqueues and - * the release work item is scheduled on a per-cpu workqueue. To - * avoid lockdep warning, unbound pool->locks are given lockdep - * subclass of 1 in get_unbound_pool(). + * @pwq can't be released under pool->lock, bounce to a dedicated + * kthread_worker to avoid A-A deadlocks. */ - schedule_work(&pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->unbound_release_work); } /** @@ -3965,7 +3968,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); pool->node = target_node; @@ -3999,10 +4001,10 @@ static void rcu_free_pwq(struct rcu_head *rcu) } /* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. + * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero + * refcnt and needs to be destroyed. */ -static void pwq_unbound_release_workfn(struct work_struct *work) +static void pwq_unbound_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, unbound_release_work); @@ -4110,7 +4112,8 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + kthread_init_work(&pwq->unbound_release_work, + pwq_unbound_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -6433,6 +6436,9 @@ static void __init wq_cpu_intensive_thresh_init(void) if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what From 687a9aa56f811b381e63f7f8f9149428ac708a3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 11/31] workqueue: Make per-cpu pool_workqueues allocated and released like unbound ones Currently, all per-cpu pwq's (pool_workqueue's) are allocated directly through a per-cpu allocation and thus, unlike unbound workqueues, not reference counted. This difference in lifetime management between the two types is a bit confusing. Unbound workqueues are currently accessed through wq->numa_pwq_tbl[] which isn't suitiable for the planned CPU locality related improvements. The plan is to unify pwq handling across per-cpu and unbound workqueues so that they're always accessed through wq->cpu_pwq. In preparation, this patch makes per-cpu pwq's to be allocated, reference counted and released the same way as unbound pwq's. wq->cpu_pwq now holds pointers to pwq's instead of containing them directly. pwq_unbound_release_workfn() is renamed to pwq_release_workfn() as it's now also used for per-cpu work items. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 74 +++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 01bf22c5d515..05bf5427124a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -258,11 +258,11 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to a kthread_worker. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue itself - * is also RCU protected so that the first pwq can be determined without + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without * grabbing wq->mutex. */ - struct kthread_work unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -321,7 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwq; /* I: per-cpu pwqs */ + struct pool_workqueue __percpu **cpu_pwq; /* I: per-cpu pwqs */ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; @@ -1370,13 +1370,11 @@ static void put_pwq(struct pool_workqueue *pwq) lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; - if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) - return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. */ - kthread_queue_work(pwq_release_worker, &pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** @@ -1685,7 +1683,7 @@ retry: } else { if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwq, cpu); + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); } pool = pwq->pool; @@ -4004,31 +4002,30 @@ static void rcu_free_pwq(struct rcu_head *rcu) * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ -static void pwq_unbound_release_workfn(struct kthread_work *work) +static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); + release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* - * when @pwq is not linked, it doesn't hold any reference to the + * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); } - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + } call_rcu(&pwq->rcu, rcu_free_pwq); @@ -4112,8 +4109,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - kthread_init_work(&pwq->unbound_release_work, - pwq_unbound_release_workfn); + kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -4514,20 +4510,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwq = alloc_percpu(struct pool_workqueue); + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) - return -ENOMEM; + goto enomem; for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = + struct pool_workqueue **pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); - struct worker_pool *cpu_pools = - per_cpu(cpu_worker_pools, cpu); + struct worker_pool *pool = + &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); - init_pwq(pwq, wq, &cpu_pools[highpri]); + *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, + pool->node); + if (!*pwq_p) + goto enomem; + + init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); - link_pwq(pwq); + link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; @@ -4546,6 +4547,15 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_unlock(); return ret; + +enomem: + if (wq->cpu_pwq) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + free_percpu(wq->cpu_pwq); + wq->cpu_pwq = NULL; + } + return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, @@ -4719,7 +4729,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int node; + int cpu, node; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4779,12 +4789,8 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_unlock(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND)) { - wq_unregister_lockdep(wq); - /* - * The base ref is never dropped on per-cpu pwqs. Directly - * schedule RCU free. - */ - call_rcu(&wq->rcu, rcu_free_wq); + for_each_possible_cpu(cpu) + put_pwq_unlocked(*per_cpu_ptr(wq->cpu_pwq, cpu)); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4901,7 +4907,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) cpu = smp_processor_id(); if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwq, cpu); + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); From 4cbfd3de737b9d00544ff0f673cb75fc37bffb6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 12/31] workqueue: Call wq_update_unbound_numa() on all CPUs in NUMA node on CPU hotplug When a CPU went online or offline, wq_update_unbound_numa() was called only on the CPU which was going up or down. This works fine because all CPUs on the same NUMA node share the same pool_workqueue slot - one CPU updating it updates it for everyone in the node. However, future changes will make each CPU use a separate pool_workqueue even when they're sharing the same worker_pool, which requires updating pool_workqueue's for all CPUs which may be sharing the same pool_workqueue on hotplug. To accommodate the planned changes, this patch updates workqueue_on/offline_cpu() so that they call wq_update_unbound_numa() for all CPUs sharing the same NUMA node as the CPU going up or down. In the current code, the second+ calls would be noops and there shouldn't be any behavior changes. * As wq_update_unbound_numa() is now called on multiple CPUs per each hotplug event, @cpu is renamed to @hotplug_cpu and another @cpu argument is added. The former indicates the CPU being hot[un]plugged and the latter the CPU whose pool_workqueue is being updated. * In wq_update_unbound_numa(), cpu_off is renamed to off_cpu for consistency with the new @hotplug_cpu. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 05bf5427124a..9f4341885f60 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4422,7 +4422,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU coming up or going down + * @cpu: the CPU to update pool association for + * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and @@ -4442,10 +4443,10 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN_PREPARE. */ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - bool online) + int hotplug_cpu, bool online) { int node = cpu_to_node(cpu); - int cpu_off = online ? -1 : cpu; + int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; cpumask_t *cpumask; @@ -4473,7 +4474,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * and create a new one if they don't match. If the target cpumask * equals the default pwq's, the default pwq should be used. */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { + if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) return; } else { @@ -5514,8 +5515,15 @@ int workqueue_online_cpu(unsigned int cpu) } /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + list_for_each_entry(wq, &workqueues, list) { + int tcpu; + + for_each_possible_cpu(tcpu) { + if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + wq_update_unbound_numa(wq, tcpu, cpu, true); + } + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5533,8 +5541,15 @@ int workqueue_offline_cpu(unsigned int cpu) /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); + list_for_each_entry(wq, &workqueues, list) { + int tcpu; + + for_each_possible_cpu(tcpu) { + if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + wq_update_unbound_numa(wq, tcpu, cpu, false); + } + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -6509,7 +6524,8 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), true); + wq_update_unbound_numa(wq, smp_processor_id(), smp_processor_id(), + true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); From 636b927eba5bc633753f8eb80f35e1d5be806e51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 13/31] workqueue: Make unbound workqueues to use per-cpu pool_workqueues A pwq (pool_workqueue) represents an association between a workqueue and a worker_pool. When a work item is queued, the workqueue selects the pwq to use, which in turn determines the pool, and queues the work item to the pool through the pwq. pwq is also what implements the maximum concurrency limit - @max_active. As a per-cpu workqueue should be assocaited with a different worker_pool on each CPU, it always had per-cpu pwq's that are accessed through wq->cpu_pwq. However, unbound workqueues were sharing a pwq within each NUMA node by default. The sharing has several downsides: * Because @max_active is per-pwq, the meaning of @max_active changes depending on the machine configuration and whether workqueue NUMA locality support is enabled. * Makes per-cpu and unbound code deviate. * Gets in the way of making workqueue CPU locality awareness more flexible. This patch makes unbound workqueues use per-cpu pwq's the same way per-cpu workqueues do by making the following changes: * wq->numa_pwq_tbl[] is removed and unbound workqueues now use wq->cpu_pwq just like per-cpu workqueues. wq->cpu_pwq is now RCU protected for unbound workqueues. * numa_pwq_tbl_install() is renamed to install_unbound_pwq() and installs the specified pwq to the target CPU's wq->cpu_pwq. * apply_wqattrs_prepare() now always allocates a separate pwq for each CPU unless the workqueue is ordered. If ordered, all CPUs use wq->dfl_pwq. This makes the return value of wq_calc_node_cpumask() unnecessary. It now returns void. * @max_active now means the same thing for both per-cpu and unbound workqueues. WQ_UNBOUND_MAX_ACTIVE now equals WQ_MAX_ACTIVE and documentation is updated accordingly. WQ_UNBOUND_MAX_ACTIVE is no longer used in workqueue implementation and will be removed later. * All unbound pwq operations which used to be per-numa-node are now per-cpu. For most unbound workqueue users, this shouldn't cause noticeable changes. Work item issue and completion will be a small bit faster, flush_workqueue() would become a bit more expensive, and the total concurrency limit would likely become higher. All @max_active==1 use cases are currently being audited for conversion into alloc_ordered_workqueue() and they shouldn't be affected once the audit and conversion is complete. One area where the behavior change may be more noticeable is workqueue_congested() as the reported congestion state is now per CPU instead of NUMA node. There are only two users of this interface - drivers/infiniband/hw/hfi1 and net/smc. Maintainers of both subsystems are cc'd. Inputs on the behavior change would be very much appreciated. Signed-off-by: Tejun Heo Acked-by: Dennis Dalessandro Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Karsten Graul Cc: Wenjia Zhang Cc: Jan Karcher --- Documentation/core-api/workqueue.rst | 19 ++- include/linux/workqueue.h | 8 +- kernel/workqueue.c | 218 ++++++++++----------------- 3 files changed, 88 insertions(+), 157 deletions(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index a4c9b9d1905f..8e541c5d8fa9 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -220,17 +220,16 @@ resources, scheduled and executed. ``max_active`` -------------- -``@max_active`` determines the maximum number of execution contexts -per CPU which can be assigned to the work items of a wq. For example, -with ``@max_active`` of 16, at most 16 work items of the wq can be -executing at the same time per CPU. +``@max_active`` determines the maximum number of execution contexts per +CPU which can be assigned to the work items of a wq. For example, with +``@max_active`` of 16, at most 16 work items of the wq can be executing +at the same time per CPU. This is always a per-CPU attribute, even for +unbound workqueues. -Currently, for a bound wq, the maximum limit for ``@max_active`` is -512 and the default value used when 0 is specified is 256. For an -unbound wq, the limit is higher of 512 and 4 * -``num_possible_cpus()``. These values are chosen sufficiently high -such that they are not the limiting factor while providing protection -in runaway cases. +The maximum limit for ``@max_active`` is 512 and the default value used +when 0 is specified is 256. These values are chosen sufficiently high +such that they are not the limiting factor while providing protection in +runaway cases. The number of active work items of a wq is usually regulated by the users of the wq, more specifically, by how many work items the users diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1a4223cbdb5f..52d1a6225b44 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -343,14 +343,10 @@ enum { __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ - WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ + WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; -/* unbound wq's aren't per-cpu, scale max_active according to #cpus */ -#define WQ_UNBOUND_MAX_ACTIVE \ - max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) - /* * System-wide workqueues which are always present. * @@ -391,7 +387,7 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items, 0 for default + * @max_active: max in-flight work items per CPU, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9f4341885f60..48208888aee0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -321,8 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu **cpu_pwq; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; @@ -608,35 +607,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1676,16 +1646,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ - if (wq->flags & WQ_UNBOUND) { - if (req_cpu == WORK_CPU_UNBOUND) + if (req_cpu == WORK_CPU_UNBOUND) { + if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - } else { - if (req_cpu == WORK_CPU_UNBOUND) + else cpu = raw_smp_processor_id(); - pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); } + pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* @@ -1715,12 +1683,11 @@ retry: } /* - * pwq is determined and locked. For unbound pools, we could have - * raced with pwq release and it could already be dead. If its - * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it in the numa_pwq_tbl or while - * work items are executing on it, so the retrying is guaranteed to - * make forward-progress. + * pwq is determined and locked. For unbound pools, we could have raced + * with pwq release and it could already be dead. If its refcnt is zero, + * repeat pwq selection. Note that unbound pwqs never die without + * another pwq replacing it in cpu_pwq or while work items are executing + * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { @@ -3818,12 +3785,8 @@ static void rcu_free_wq(struct rcu_head *rcu) container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); - - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwq); - else - free_workqueue_attrs(wq->unbound_attrs); - + free_percpu(wq->cpu_pwq); + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } @@ -4174,11 +4137,8 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @node stays * stable. - * - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, +static void wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { if (!wq_numa_enabled || attrs->no_numa) @@ -4195,23 +4155,18 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); - if (cpumask_empty(cpumask)) { + if (cpumask_empty(cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return false; - } - - return !cpumask_equal(cpumask, attrs->cpumask); + return; use_dfl: cpumask_copy(cpumask, attrs->cpumask); - return false; } -/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ -static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, - int node, - struct pool_workqueue *pwq) +/* install @pwq into @wq's cpu_pwq and return the old pwq */ +static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, + int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; @@ -4221,8 +4176,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } @@ -4239,10 +4194,10 @@ struct apply_wqattrs_ctx { static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { - int node; + int cpu; - for_each_node(node) - put_pwq_unlocked(ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); @@ -4259,11 +4214,11 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; - int node; + int cpu; lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); tmp_attrs = alloc_workqueue_attrs(); @@ -4297,14 +4252,16 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, if (!ctx->dfl_pwq) goto out_free; - for_each_node(node) { - if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { - ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!ctx->pwq_tbl[node]) - goto out_free; - } else { + for_each_possible_cpu(cpu) { + if (new_attrs->no_numa) { ctx->dfl_pwq->refcnt++; - ctx->pwq_tbl[node] = ctx->dfl_pwq; + ctx->pwq_tbl[cpu] = ctx->dfl_pwq; + } else { + wq_calc_node_cpumask(new_attrs, cpu_to_node(cpu), -1, + tmp_attrs->cpumask); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); + if (!ctx->pwq_tbl[cpu]) + goto out_free; } } @@ -4327,7 +4284,7 @@ out_free: /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { - int node; + int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); @@ -4335,9 +4292,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ - for_each_node(node) - ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, - ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, + ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); @@ -4466,20 +4423,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); - pwq = unbound_pwq_by_node(wq, node); - /* - * Let's determine what needs to be done. If the target cpumask is - * different from the default pwq's, we need to compare it to @pwq's - * and create a new one if they don't match. If the target cpumask - * equals the default pwq's, the default pwq should be used. - */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask)) { - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - return; - } else { - goto use_dfl_pwq; - } + /* nothing to do if the target cpumask matches the current pwq */ + wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask); + pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), + lockdep_is_held(&wq_pool_mutex)); + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); @@ -4491,7 +4441,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* Install the new pwq. */ mutex_lock(&wq->mutex); - old_pwq = numa_pwq_tbl_install(wq, node, pwq); + old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: @@ -4499,7 +4449,7 @@ use_dfl_pwq: raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); + old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4510,11 +4460,11 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; - if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); - if (!wq->cpu_pwq) - goto enomem; + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); + if (!wq->cpu_pwq) + goto enomem; + if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); @@ -4562,13 +4512,11 @@ enomem: static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { - int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active > lim) + if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + max_active, name, 1, WQ_MAX_ACTIVE); - return clamp_val(max_active, 1, lim); + return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* @@ -4612,7 +4560,6 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) { - size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; @@ -4632,10 +4579,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, flags |= WQ_UNBOUND; /* allocate wq and format name */ - if (flags & WQ_UNBOUND) - tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); - - wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -4730,7 +4674,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int cpu, node; + int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4789,29 +4733,23 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - if (!(wq->flags & WQ_UNBOUND)) { - for_each_possible_cpu(cpu) - put_pwq_unlocked(*per_cpu_ptr(wq->cpu_pwq, cpu)); - } else { - /* - * We're the sole accessor of @wq at this point. Directly - * access numa_pwq_tbl[] and dfl_pwq to put the base refs. - * @wq will be freed when the last pwq is released. - */ - for_each_node(node) { - pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); - put_pwq_unlocked(pwq); - } + /* + * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq + * to put the base refs. @wq will be auto-destroyed from the last + * pwq_put. RCU read lock prevents @wq from going away from under us. + */ + rcu_read_lock(); - /* - * Put dfl_pwq. @wq may be freed any time after dfl_pwq is - * put. Don't access it afterwards. - */ - pwq = wq->dfl_pwq; - wq->dfl_pwq = NULL; + for_each_possible_cpu(cpu) { + pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); put_pwq_unlocked(pwq); } + + put_pwq_unlocked(wq->dfl_pwq); + wq->dfl_pwq = NULL; + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); @@ -4888,10 +4826,11 @@ bool current_is_workqueue_rescuer(void) * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. - * Note that both per-cpu and unbound workqueues may be associated with - * multiple pool_workqueues which have separate congested states. A - * workqueue being congested on one CPU doesn't mean the workqueue is also - * contested on other CPUs / NUMA nodes. + * + * With the exception of ordered workqueues, all workqueues have per-cpu + * pool_workqueues, each with its own congested state. A workqueue being + * congested on one CPU doesn't mean that the workqueue is contested on any + * other CPUs. * * Return: * %true if congested, %false otherwise. @@ -4907,12 +4846,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); - if (!(wq->flags & WQ_UNBOUND)) - pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); - else - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); + preempt_enable(); rcu_read_unlock(); @@ -6434,7 +6370,7 @@ void __init workqueue_init_early(void) system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_UNBOUND_MAX_ACTIVE); + WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", From af73f5c9febe5095ee492ae43e9898fca65ced70 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 14/31] workqueue: Rename workqueue_attrs->no_numa to ->ordered With the recent removal of NUMA related module param and sysfs knob, workqueue_attrs->no_numa is now only used to implement ordered workqueues. Let's rename the field so that it's less confusing especially with the planned CPU affinity awareness improvements. Just a rename. No functional changes. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 19 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 52d1a6225b44..f0c10f491b15 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -142,13 +142,13 @@ struct workqueue_attrs { cpumask_var_t cpumask; /** - * @no_numa: disable NUMA affinity + * @ordered: work items must be executed one by one in queueing order * - * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It + * Unlike other fields, ``ordered`` isn't a property of a worker_pool. It * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus * doesn't participate in pool hash calculations or equality comparisons. */ - bool no_numa; + bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 48208888aee0..82413df1c120 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3672,10 +3672,10 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, cpumask_copy(to->cpumask, from->cpumask); /* * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * ->ordered as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->ordered after copying. */ - to->no_numa = from->no_numa; + to->ordered = from->ordered; } /* hash value of the content of @attr */ @@ -3933,10 +3933,10 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) pool->node = target_node; /* - * no_numa isn't a worker_pool attribute, always clear it. See + * ordered isn't a worker_pool attribute, always clear it. See * 'struct workqueue_attrs' comments for detail. */ - pool->attrs->no_numa = false; + pool->attrs->ordered = false; if (worker_pool_assign_id(pool) < 0) goto fail; @@ -4141,7 +4141,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, static void wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled || attrs->no_numa) + if (!wq_numa_enabled || attrs->ordered) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ @@ -4253,7 +4253,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, goto out_free; for_each_possible_cpu(cpu) { - if (new_attrs->no_numa) { + if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { @@ -4411,7 +4411,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, lockdep_assert_held(&wq_pool_mutex); if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || - wq->unbound_attrs->no_numa) + wq->unbound_attrs->ordered) return; /* @@ -6358,11 +6358,10 @@ void __init workqueue_init_early(void) /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. - * Turn off NUMA so that dfl_pwq is used for all nodes. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; - attrs->no_numa = true; + attrs->ordered = true; ordered_wq_attrs[i] = attrs; } From fef59c9cab6ac5592da54f6c2b1195418f14e4d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: [PATCH 15/31] workqueue: Rename NUMA related names to use pod instead Workqueue is in the process of improving CPU affinity awareness. It will become more flexible and won't be tied to NUMA node boundaries. This patch renames all NUMA related names in workqueue.c to use "pod" instead. While "pod" isn't a very common term, it short and captures the grouping of CPUs well enough. These names are only going to be used within workqueue implementation proper, so the specific naming doesn't matter that much. * wq_numa_possible_cpumask -> wq_pod_cpus * wq_numa_enabled -> wq_pod_enabled * wq_update_unbound_numa_attrs_buf -> wq_update_pod_attrs_buf * workqueue_select_cpu_near -> select_numa_node_cpu This rename is different from others. The function is only used by queue_work_node() and specifically tries to find a CPU in the specified NUMA node. As workqueue affinity will become more flexible and untied from NUMA, this function's name should specifically describe that it's for NUMA. * wq_calc_node_cpumask -> wq_calc_pod_cpumask * wq_update_unbound_numa -> wq_update_pod * wq_numa_init -> wq_pod_init * node -> pod in local variables Only renames. No functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 159 +++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 84 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82413df1c120..8a484f606e74 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -326,8 +326,7 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_numa_possible_cpumask; - /* possible CPUs of each node */ +static cpumask_var_t *wq_pod_cpus; /* possible CPUs of each node */ /* * Per-cpu work items which run for longer than the following threshold are @@ -345,10 +344,10 @@ module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ -static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ +static bool wq_pod_enabled; /* unbound CPU pod affinity enabled */ -/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; +/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -1762,7 +1761,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL(queue_work_on); /** - * workqueue_select_cpu_near - Select a CPU based on NUMA node + * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given @@ -1770,12 +1769,12 @@ EXPORT_SYMBOL(queue_work_on); * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. */ -static int workqueue_select_cpu_near(int node) +static int select_numa_node_cpu(int node) { int cpu; /* No point in doing this if NUMA isn't enabled for workqueues */ - if (!wq_numa_enabled) + if (!wq_pod_enabled) return WORK_CPU_UNBOUND; /* Delay binding to CPU if node is not valid or online */ @@ -1834,7 +1833,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - int cpu = workqueue_select_cpu_near(node); + int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; @@ -3900,8 +3899,8 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - int node; - int target_node = NUMA_NO_NODE; + int pod; + int target_pod = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3913,24 +3912,23 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(attrs->cpumask, - wq_numa_possible_cpumask[node])) { - target_node = node; + /* if cpumask is contained inside a pod, we belong to that pod */ + if (wq_pod_enabled) { + for_each_node(pod) { + if (cpumask_subset(attrs->cpumask, wq_pod_cpus[pod])) { + target_pod = pod; break; } } } /* nope, create a new one */ - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_pod); if (!pool || init_worker_pool(pool) < 0) goto fail; copy_workqueue_attrs(pool->attrs, attrs); - pool->node = target_node; + pool->node = target_pod; /* * ordered isn't a worker_pool attribute, always clear it. See @@ -4120,40 +4118,38 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, } /** - * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue - * @node: the target NUMA node + * @pod: the target CPU pod * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask * - * Calculate the cpumask a workqueue with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. + * Calculate the cpumask a workqueue with @attrs should use on @pod. If + * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * The result is stored in @cpumask. * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. + * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled + * and @pod has online CPUs requested by @attrs, the returned cpumask is the + * intersection of the possible CPUs of @pod and @attrs->cpumask. * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. + * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, +static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int pod, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled || attrs->ordered) + if (!wq_pod_enabled || attrs->ordered) goto use_dfl; - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + /* does @pod have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(pod), attrs->cpumask); if (cpu_going_down >= 0) cpumask_clear_cpu(cpu_going_down, cpumask); if (cpumask_empty(cpumask)) goto use_dfl; - /* yeap, return possible CPUs in @node that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + /* yeap, return possible CPUs in @pod that @attrs wants */ + cpumask_and(cpumask, attrs->cpumask, wq_pod_cpus[pod]); if (cpumask_empty(cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " @@ -4257,8 +4253,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_node_cpumask(new_attrs, cpu_to_node(cpu), -1, - tmp_attrs->cpumask); + wq_calc_pod_cpumask(new_attrs, cpu_to_node(cpu), -1, + tmp_attrs->cpumask); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; @@ -4349,12 +4345,11 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps + * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that + * work items are affine to the pod it was issued on. Older pwqs are released as + * in-flight work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * @@ -4377,32 +4372,31 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue * @cpu: the CPU to update pool association for * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of * @wq accordingly. * - * If NUMA affinity can't be adjusted due to memory allocation failure, it - * falls back to @wq->dfl_pwq which may not be optimal but is always - * correct. * - * Note that when the last allowed CPU of a NUMA node goes offline for a - * workqueue with a cpumask spanning multiple nodes, the workers which were - * already executing the work items for the workqueue will lose their CPU - * affinity and may execute on any CPU. This is similar to how per-cpu - * workqueues behave on CPU_DOWN. If a workqueue user wants strict - * affinity, it's the user's responsibility to flush the work item from - * CPU_DOWN_PREPARE. + * If pod affinity can't be adjusted due to memory allocation failure, it falls + * back to @wq->dfl_pwq which may not be optimal but is always correct. + * + * Note that when the last allowed CPU of a pod goes offline for a workqueue + * with a cpumask spanning multiple pods, the workers which were already + * executing the work items for the workqueue will lose their CPU affinity and + * may execute on any CPU. This is similar to how per-cpu workqueues behave on + * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's + * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - int hotplug_cpu, bool online) +static void wq_update_pod(struct workqueue_struct *wq, int cpu, + int hotplug_cpu, bool online) { - int node = cpu_to_node(cpu); + int pod = cpu_to_node(cpu); int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -4410,7 +4404,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || + if (!wq_pod_enabled || !(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; @@ -4419,13 +4413,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_unbound_numa_attrs_buf; + target_attrs = wq_update_pod_attrs_buf; cpumask = target_attrs->cpumask; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask); + wq_calc_pod_cpumask(wq->dfl_pwq->pool->attrs, pod, off_cpu, cpumask); pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), lockdep_is_held(&wq_pool_mutex)); if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) @@ -4434,7 +4428,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } @@ -4565,11 +4559,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, struct pool_workqueue *pwq; /* - * Unbound && max_active == 1 used to imply ordered, which is no - * longer the case on NUMA machines due to per-node pools. While + * Unbound && max_active == 1 used to imply ordered, which is no longer + * the case on many machines due to per-pod pools. While * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages - * on NUMA. + * workqueue, keep the previous behavior to avoid subtle breakages. */ if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; @@ -5450,13 +5443,13 @@ int workqueue_online_cpu(unsigned int cpu) mutex_unlock(&wq_pool_attach_mutex); } - /* update NUMA affinity of unbound workqueues */ + /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { int tcpu; for_each_possible_cpu(tcpu) { if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { - wq_update_unbound_numa(wq, tcpu, cpu, true); + wq_update_pod(wq, tcpu, cpu, true); } } } @@ -5475,14 +5468,14 @@ int workqueue_offline_cpu(unsigned int cpu) unbind_workers(cpu); - /* update NUMA affinity of unbound workqueues */ + /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { int tcpu; for_each_possible_cpu(tcpu) { if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { - wq_update_unbound_numa(wq, tcpu, cpu, false); + wq_update_pod(wq, tcpu, cpu, false); } } } @@ -6263,7 +6256,7 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void __init wq_numa_init(void) +static void __init wq_pod_init(void) { cpumask_var_t *tbl; int node, cpu; @@ -6278,8 +6271,8 @@ static void __init wq_numa_init(void) } } - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_unbound_numa_attrs_buf); + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); /* * We want masks of possible CPUs of each node which isn't readily @@ -6298,8 +6291,8 @@ static void __init wq_numa_init(void) cpumask_set_cpu(cpu, tbl[node]); } - wq_numa_possible_cpumask = tbl; - wq_numa_enabled = true; + wq_pod_cpus = tbl; + wq_pod_enabled = true; } /** @@ -6440,15 +6433,14 @@ void __init workqueue_init(void) wq_cpu_intensive_thresh_init(); /* - * It'd be simpler to initialize NUMA in workqueue_init_early() but - * CPU to node mapping may not be available that early on some - * archs such as power and arm64. As per-cpu pools created - * previously could be missing node hint and unbound pools NUMA - * affinity, fix them up. + * It'd be simpler to initialize pods in workqueue_init_early() but CPU + * to node mapping may not be available that early on some archs such as + * power and arm64. As per-cpu pools created previously could be missing + * node hint and unbound pool pod affinity, fix them up. * * Also, while iterating workqueues, create rescuers if requested. */ - wq_numa_init(); + wq_pod_init(); mutex_lock(&wq_pool_mutex); @@ -6459,8 +6451,7 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), smp_processor_id(), - true); + wq_update_pod(wq, smp_processor_id(), smp_processor_id(), true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); From a86feae6195ac2148097b063f7fdad8ee1f6dad4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 16/31] workqueue: Move wq_pod_init() below workqueue_init() wq_pod_init() is called from workqueue_init() and responsible for initializing unbound CPU pods according to NUMA node. Workqueue is in the process of improving affinity awareness and wants to use other topology information to initialize unbound CPU pods; however, unlike NUMA nodes, other topology information isn't yet available in workqueue_init(). The next patch will introduce a later stage init function for workqueue which will be responsible for initializing unbound CPU pods. Relocate wq_pod_init() below workqueue_init() where the new init function is going to be located so that the diff can show the content differences. Just a relocation. No functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 78 ++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8a484f606e74..1e528b7e12c5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6256,44 +6256,7 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void __init wq_pod_init(void) -{ - cpumask_var_t *tbl; - int node, cpu; - - if (num_possible_nodes() <= 1) - return; - - for_each_possible_cpu(cpu) { - if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - return; - } - } - - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); - - /* - * We want masks of possible CPUs of each node which isn't readily - * available. Build one from cpu_to_node() which should have been - * fully initialized by now. - */ - tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); - BUG_ON(!tbl); - - for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE)); - - for_each_possible_cpu(cpu) { - node = cpu_to_node(cpu); - cpumask_set_cpu(cpu, tbl[node]); - } - - wq_pod_cpus = tbl; - wq_pod_enabled = true; -} +static void wq_pod_init(void); /** * workqueue_init_early - early init for workqueue subsystem @@ -6474,6 +6437,45 @@ void __init workqueue_init(void) wq_watchdog_init(); } +static void __init wq_pod_init(void) +{ + cpumask_var_t *tbl; + int node, cpu; + + if (num_possible_nodes() <= 1) + return; + + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + /* + * We want masks of possible CPUs of each node which isn't readily + * available. Build one from cpu_to_node() which should have been + * fully initialized by now. + */ + tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); + BUG_ON(!tbl); + + for_each_node(node) + BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE)); + + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + cpumask_set_cpu(cpu, tbl[node]); + } + + wq_pod_cpus = tbl; + wq_pod_enabled = true; +} + void __warn_flushing_systemwide_wq(void) { pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); From 2930155b2e27232c033970f2e110aaac4187cb9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 17/31] workqueue: Initialize unbound CPU pods later in the boot During boot, to initialize unbound CPU pods, wq_pod_init() was called from workqueue_init(). This is early enough for NUMA nodes to be set up but before SMP is brought up and CPU topology information is populated. Workqueue is in the process of improving CPU locality for unbound workqueues and will need access to topology information during pod init. This adds a new init function workqueue_init_topology() which is called after CPU topology information is available and replaces wq_pod_init(). As unbound CPU pods are now initialized after workqueues are activated, we need to revisit the workqueues to apply the pod configuration. Workqueues which are created before workqueue_init_topology() are set up so that they always use the default worker pool. After pods are set up in workqueue_init_topology(), wq_update_pod() is called on all existing workqueues to update the pool associations accordingly. Note that wq_update_pod_attrs_buf allocation is moved to workqueue_init_early(). This isn't necessary right now but enables further generalization of pod handling in the future. This patch changes the initialization sequence but the end result should be the same. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + init/main.c | 1 + kernel/workqueue.c | 68 +++++++++++++++++++++++---------------- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f0c10f491b15..bab9fa3453ed 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -672,5 +672,6 @@ int workqueue_offline_cpu(unsigned int cpu); void __init workqueue_init_early(void); void __init workqueue_init(void); +void __init workqueue_init_topology(void); #endif diff --git a/init/main.c b/init/main.c index ad920fac325c..436d73261810 100644 --- a/init/main.c +++ b/init/main.c @@ -1540,6 +1540,7 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + workqueue_init_topology(); padata_init(); page_alloc_init_late(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e528b7e12c5..5914c820a4f1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6256,17 +6256,15 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void wq_pod_init(void); - /** * workqueue_init_early - early init for workqueue subsystem * - * This is the first half of two-staged workqueue subsystem initialization - * and invoked as soon as the bare basics - memory allocation, cpumasks and - * idr are up. It sets up all the data structures and system workqueues - * and allows early boot code to create workqueues and queue/cancel work - * items. Actual work item execution starts only after kthreads can be - * created and scheduled right before early initcalls. + * This is the first step of three-staged workqueue subsystem initialization and + * invoked as soon as the bare basics - memory allocation, cpumasks and idr are + * up. It sets up all the data structures and system workqueues and allows early + * boot code to create workqueues and queue/cancel work items. Actual work item + * execution starts only after kthreads can be created and scheduled right + * before early initcalls. */ void __init workqueue_init_early(void) { @@ -6284,6 +6282,9 @@ void __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -6381,11 +6382,11 @@ static void __init wq_cpu_intensive_thresh_init(void) /** * workqueue_init - bring workqueue subsystem fully online * - * This is the latter half of two-staged workqueue subsystem initialization - * and invoked as soon as kthreads can be created and scheduled. - * Workqueues have been created and work items queued on them, but there - * are no kworkers executing the work items yet. Populate the worker pools - * with the initial workers and enable future kworker creations. + * This is the second step of three-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. Workqueues have + * been created and work items queued on them, but there are no kworkers + * executing the work items yet. Populate the worker pools with the initial + * workers and enable future kworker creations. */ void __init workqueue_init(void) { @@ -6395,18 +6396,12 @@ void __init workqueue_init(void) wq_cpu_intensive_thresh_init(); - /* - * It'd be simpler to initialize pods in workqueue_init_early() but CPU - * to node mapping may not be available that early on some archs such as - * power and arm64. As per-cpu pools created previously could be missing - * node hint and unbound pool pod affinity, fix them up. - * - * Also, while iterating workqueues, create rescuers if requested. - */ - wq_pod_init(); - mutex_lock(&wq_pool_mutex); + /* + * Per-cpu pools created earlier could be missing node hint. Fix them + * up. Also, create a rescuer for workqueues that requested it. + */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->node = cpu_to_node(cpu); @@ -6414,7 +6409,6 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_pod(wq, smp_processor_id(), smp_processor_id(), true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); @@ -6437,8 +6431,16 @@ void __init workqueue_init(void) wq_watchdog_init(); } -static void __init wq_pod_init(void) +/** + * workqueue_init_topology - initialize CPU pods for unbound workqueues + * + * This is the third step of there-staged workqueue subsystem initialization and + * invoked after SMP and topology information are fully initialized. It + * initializes the unbound CPU pods accordingly. + */ +void __init workqueue_init_topology(void) { + struct workqueue_struct *wq; cpumask_var_t *tbl; int node, cpu; @@ -6452,8 +6454,7 @@ static void __init wq_pod_init(void) } } - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); + mutex_lock(&wq_pool_mutex); /* * We want masks of possible CPUs of each node which isn't readily @@ -6474,6 +6475,19 @@ static void __init wq_pod_init(void) wq_pod_cpus = tbl; wq_pod_enabled = true; + + /* + * Workqueues allocated earlier would have all CPUs sharing the default + * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU + * combinations to apply per-pod sharing. + */ + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); } void __warn_flushing_systemwide_wq(void) From 0f36ee24cd43c67be07166ddd09866dc7a47cb4c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 18/31] workqueue: Factor out actual cpumask calculation to reduce subtlety in wq_update_pod() For an unbound pool, multiple cpumasks are involved. U: The user-specified cpumask (may be filtered with cpu_possible_mask). A: The actual cpumask filtered by wq_unbound_cpumask. If the filtering leaves no CPU, wq_unbound_cpumask is used. P: Per-pod subsets of #A. wq->attrs stores #U, wq->dfl_pwq->pool->attrs->cpumask #A, and wq->cpu_pwq[CPU]->pool->attrs->cpumask #P. wq_update_pod() is called to update per-pod pwq's during CPU hotplug. To calculate the new #P for each workqueue, it needs to call wq_calc_pod_cpumask() with @attrs that contains #A. Currently, wq_update_pod() achieves this by calling wq_calc_pod_cpumask() with wq->dfl_pwq->pool->attrs. This is rather fragile because we're calling wq_calc_pod_cpumask() with @attrs of a worker_pool rather than the workqueue's actual attrs when what we want to calculate is the workqueue's cpumask on the pod. While this works fine currently, future changes will add fields which are used differently between workqueues and worker_pools and this subtlety will bite us. This patch factors out #U -> #A calculation from apply_wqattrs_prepare() into wqattrs_actualize_cpumask and updates wq_update_pod() to copy wq->unbound_attrs and use the new helper to obtain #A freshly instead of abusing wq->dfl_pwq->pool_attrs. This shouldn't cause any behavior changes in the current code. Signed-off-by: Tejun Heo Reported-by: K Prateek Nayak Reference: http://lkml.kernel.org/r/30625cdd-4d61-594b-8db9-6816b017dde3@amd.com --- kernel/workqueue.c | 49 +++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5914c820a4f1..3f1fffa8e6d4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -348,6 +348,7 @@ static bool wq_pod_enabled; /* unbound CPU pod affinity enabled */ /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; +static cpumask_var_t wq_update_pod_cpumask_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -3699,6 +3700,20 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return true; } +/* Update @attrs with actually available CPUs */ +static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, + const cpumask_t *unbound_cpumask) +{ + /* + * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If + * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to + * @unbound_cpumask. + */ + cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); + if (unlikely(cpumask_empty(attrs->cpumask))) + cpumask_copy(attrs->cpumask, unbound_cpumask); +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize @@ -4221,33 +4236,23 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, if (!ctx || !new_attrs || !tmp_attrs) goto out_free; - /* - * Calculate the attrs of the default pwq with unbound_cpumask - * which is wq_unbound_cpumask or to set to wq_unbound_cpumask. - * If the user configured cpumask doesn't overlap with the - * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. - */ - copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask); - if (unlikely(cpumask_empty(new_attrs->cpumask))) - cpumask_copy(new_attrs->cpumask, unbound_cpumask); - - /* - * We may create multiple pwqs with differing cpumasks. Make a - * copy of @new_attrs which will be modified and used to obtain - * pools. - */ - copy_workqueue_attrs(tmp_attrs, new_attrs); - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ + copy_workqueue_attrs(new_attrs, attrs); + wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; + /* + * We may create multiple pwqs with differing cpumasks. Make a copy of + * @new_attrs which will be modified and used to obtain pools. + */ + copy_workqueue_attrs(tmp_attrs, new_attrs); + for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; @@ -4414,18 +4419,20 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, * CPU hotplug exclusion. */ target_attrs = wq_update_pod_attrs_buf; - cpumask = target_attrs->cpumask; + cpumask = wq_update_pod_cpumask_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); + wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(wq->dfl_pwq->pool->attrs, pod, off_cpu, cpumask); + wq_calc_pod_cpumask(target_attrs, pod, off_cpu, cpumask); pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), lockdep_is_held(&wq_pool_mutex)); if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) return; /* create a new pwq */ + cpumask_copy(target_attrs->cpumask, cpumask); pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", @@ -6285,6 +6292,8 @@ void __init workqueue_init_early(void) wq_update_pod_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_pod_attrs_buf); + BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL)); + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; From 5de7a03cac14765ba22934b6fb1476456ee36bf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 19/31] workqueue: Factor out clearing of workqueue-only attrs fields workqueue_attrs can be used for both workqueues and worker_pools. However, some fields, currently only ->ordered, only apply to workqueues and should be cleared to the default / invalid values. Currently, an unbound workqueue explicitly clears attrs->ordered in get_unbound_pool() after copying the source workqueue attrs, while per-cpu workqueues rely on the fact that zeroing on allocation gives us the desired default value for pool->attrs->ordered. This is fragile. Let's add wqattrs_clear_for_pool() which clears attrs->ordered and is called from both init_worker_pool() and get_unbound_pool(). This will ease adding more workqueue-only attrs fields. In get_unbound_pool(), pool->node initialization is moved upwards for readability. This shouldn't cause any behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3f1fffa8e6d4..37eab7a1587d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3678,6 +3678,15 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, to->ordered = from->ordered; } +/* + * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the + * comments in 'struct workqueue_attrs' definition. + */ +static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) +{ + attrs->ordered = false; +} + /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { @@ -3752,6 +3761,9 @@ static int init_worker_pool(struct worker_pool *pool) pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; + + wqattrs_clear_for_pool(pool->attrs); + return 0; } @@ -3942,14 +3954,10 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - copy_workqueue_attrs(pool->attrs, attrs); pool->node = target_pod; - /* - * ordered isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->ordered = false; + copy_workqueue_attrs(pool->attrs, attrs); + wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; From 84193c07105c62d206fb230b2f29002226628989 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 20/31] workqueue: Generalize unbound CPU pods While renamed to pod, the code still assumes that the pods are defined by NUMA boundaries. Let's generalize it: * workqueue_attrs->affn_scope is added. Each enum represents the type of boundaries that define the pods. There are currently two scopes - WQ_AFFN_NUMA and WQ_AFFN_SYSTEM. The former is the same behavior as before - one pod per NUMA node. The latter defines one global pod across the whole system. * struct wq_pod_type is added which describes how pods are configured for each affnity scope. For each pod, it lists the member CPUs and the preferred NUMA node for memory allocations. The reverse mapping from CPU to pod is also available. * wq_pod_enabled is dropped. Pod is now always enabled. The previously disabled behavior is now implemented through WQ_AFFN_SYSTEM. * get_unbound_pool() wants to determine the NUMA node to allocate memory from for the new pool. The variables are renamed from node to pod but the logic still assumes they're one and the same. Clearly distinguish them - walk the WQ_AFFN_NUMA pods to find the matching pod and then use the pod's NUMA node. * wq_calc_pod_cpumask() was taking @pod but assumed that it was the NUMA node. Take @cpu instead and determine the cpumask to use from the pod_type matching @attrs. * apply_wqattrs_prepare() is update to return ERR_PTR() on error instead of NULL so that it can indicate -EINVAL on invalid affinity scopes. This patch allows CPUs to be grouped into pods however desired per type. While this patch causes some internal behavior changes, nothing material should change for workqueue users. v2: Trigger WARN_ON_ONCE() in wqattrs_pod_type() if affn_scope is WQ_AFFN_NR_TYPES which indicates that the function is called with a worker_pool's attrs instead of a workqueue's. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 31 ++++++- kernel/workqueue.c | 171 ++++++++++++++++++++++++-------------- 2 files changed, 137 insertions(+), 65 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bab9fa3453ed..180491ee6706 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -125,6 +125,15 @@ struct rcu_work { struct workqueue_struct *wq; }; +enum wq_affn_scope { + WQ_AFFN_NUMA, /* one pod per NUMA node */ + WQ_AFFN_SYSTEM, /* one pod across the whole system */ + + WQ_AFFN_NR_TYPES, + + WQ_AFFN_DFL = WQ_AFFN_NUMA, +}; + /** * struct workqueue_attrs - A struct for workqueue attributes. * @@ -141,12 +150,26 @@ struct workqueue_attrs { */ cpumask_var_t cpumask; + /* + * Below fields aren't properties of a worker_pool. They only modify how + * :c:func:`apply_workqueue_attrs` select pools and thus don't + * participate in pool hash calculations or equality comparisons. + */ + + /** + * @affn_scope: unbound CPU affinity scope + * + * CPU pods are used to improve execution locality of unbound work + * items. There are multiple pod types, one for each wq_affn_scope, and + * every CPU in the system belongs to one pod in every pod type. CPUs + * that belong to the same pod share the worker pool. For example, + * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker + * pool for each NUMA node. + */ + enum wq_affn_scope affn_scope; + /** * @ordered: work items must be executed one by one in queueing order - * - * Unlike other fields, ``ordered`` isn't a property of a worker_pool. It - * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus - * doesn't participate in pool hash calculations or equality comparisons. */ bool ordered; }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 37eab7a1587d..6c4d7b1fdf9a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -326,7 +326,18 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_pod_cpus; /* possible CPUs of each node */ +/* + * Each pod type describes how CPUs should be grouped for unbound workqueues. + * See the comment above workqueue_attrs->affn_scope. + */ +struct wq_pod_type { + int nr_pods; /* number of pods */ + cpumask_var_t *pod_cpus; /* pod -> cpus */ + int *pod_node; /* pod -> node */ + int *cpu_pod; /* cpu -> pod */ +}; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; /* * Per-cpu work items which run for longer than the following threshold are @@ -344,8 +355,6 @@ module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ -static bool wq_pod_enabled; /* unbound CPU pod affinity enabled */ - /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; static cpumask_var_t wq_update_pod_cpumask_buf; @@ -1774,10 +1783,6 @@ static int select_numa_node_cpu(int node) { int cpu; - /* No point in doing this if NUMA isn't enabled for workqueues */ - if (!wq_pod_enabled) - return WORK_CPU_UNBOUND; - /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; @@ -3659,6 +3664,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); + attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); @@ -3670,11 +3676,13 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + /* - * Unlike hash and equality test, this function doesn't ignore - * ->ordered as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->ordered after copying. + * Unlike hash and equality test, copying shouldn't ignore wq-only + * fields as copying is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears the fields. */ + to->affn_scope = from->affn_scope; to->ordered = from->ordered; } @@ -3684,6 +3692,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, */ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) { + attrs->affn_scope = WQ_AFFN_NR_TYPES; attrs->ordered = false; } @@ -3723,6 +3732,25 @@ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, cpumask_copy(attrs->cpumask, unbound_cpumask); } +/* find wq_pod_type to use for @attrs */ +static const struct wq_pod_type * +wqattrs_pod_type(const struct workqueue_attrs *attrs) +{ + struct wq_pod_type *pt = &wq_pod_types[attrs->affn_scope]; + + if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && + likely(pt->nr_pods)) + return pt; + + /* + * Before workqueue_init_topology(), only SYSTEM is available which is + * initialized in workqueue_init_early(). + */ + pt = &wq_pod_types[WQ_AFFN_SYSTEM]; + BUG_ON(!pt->nr_pods); + return pt; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize @@ -3924,10 +3952,10 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - int pod; - int target_pod = NUMA_NO_NODE; + int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3939,23 +3967,20 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* if cpumask is contained inside a pod, we belong to that pod */ - if (wq_pod_enabled) { - for_each_node(pod) { - if (cpumask_subset(attrs->cpumask, wq_pod_cpus[pod])) { - target_pod = pod; - break; - } + /* If cpumask is contained inside a NUMA pod, that's our NUMA node */ + for (pod = 0; pod < pt->nr_pods; pod++) { + if (cpumask_subset(attrs->cpumask, pt->pod_cpus[pod])) { + node = pt->pod_node[pod]; + break; } } /* nope, create a new one */ - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_pod); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; - pool->node = target_pod; - + pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); wqattrs_clear_for_pool(pool->attrs); @@ -4143,7 +4168,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue - * @pod: the target CPU pod + * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline * @cpumask: outarg, the resulting cpumask * @@ -4157,30 +4182,29 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int pod, - int cpu_going_down, cpumask_t *cpumask) +static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int cpu, + int cpu_going_down, cpumask_t *cpumask) { - if (!wq_pod_enabled || attrs->ordered) - goto use_dfl; + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int pod = pt->cpu_pod[cpu]; /* does @pod have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(pod), attrs->cpumask); + cpumask_and(cpumask, pt->pod_cpus[pod], attrs->cpumask); + cpumask_and(cpumask, cpumask, cpu_online_mask); if (cpu_going_down >= 0) cpumask_clear_cpu(cpu_going_down, cpumask); - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(cpumask)) { + cpumask_copy(cpumask, attrs->cpumask); + return; + } /* yeap, return possible CPUs in @pod that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, wq_pod_cpus[pod]); + cpumask_and(cpumask, attrs->cpumask, pt->pod_cpus[pod]); if (cpumask_empty(cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return; - -use_dfl: - cpumask_copy(cpumask, attrs->cpumask); } /* install @pwq into @wq's cpu_pwq and return the old pwq */ @@ -4237,6 +4261,10 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, lockdep_assert_held(&wq_pool_mutex); + if (WARN_ON(attrs->affn_scope < 0 || + attrs->affn_scope >= WQ_AFFN_NR_TYPES)) + return ERR_PTR(-EINVAL); + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); @@ -4266,8 +4294,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_pod_cpumask(new_attrs, cpu_to_node(cpu), -1, - tmp_attrs->cpumask); + wq_calc_pod_cpumask(new_attrs, cpu, -1, tmp_attrs->cpumask); ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; @@ -4287,7 +4314,7 @@ out_free: free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); - return NULL; + return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ @@ -4343,8 +4370,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, } ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); @@ -4409,7 +4436,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, static void wq_update_pod(struct workqueue_struct *wq, int cpu, int hotplug_cpu, bool online) { - int pod = cpu_to_node(cpu); int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; @@ -4417,8 +4443,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, lockdep_assert_held(&wq_pool_mutex); - if (!wq_pod_enabled || !(wq->flags & WQ_UNBOUND) || - wq->unbound_attrs->ordered) + if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* @@ -4433,7 +4458,7 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(target_attrs, pod, off_cpu, cpumask); + wq_calc_pod_cpumask(target_attrs, cpu, off_cpu, cpumask); pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), lockdep_is_held(&wq_pool_mutex)); if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) @@ -5460,12 +5485,14 @@ int workqueue_online_cpu(unsigned int cpu) /* update pod affinity of unbound workqueues */ list_for_each_entry(wq, &workqueues, list) { - int tcpu; + struct workqueue_attrs *attrs = wq->unbound_attrs; - for_each_possible_cpu(tcpu) { - if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, true); - } } } @@ -5486,12 +5513,14 @@ int workqueue_offline_cpu(unsigned int cpu) /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); list_for_each_entry(wq, &workqueues, list) { - int tcpu; + struct workqueue_attrs *attrs = wq->unbound_attrs; - for_each_possible_cpu(tcpu) { - if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) wq_update_pod(wq, tcpu, cpu, false); - } } } mutex_unlock(&wq_pool_mutex); @@ -5689,8 +5718,8 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); - if (!ctx) { - ret = -ENOMEM; + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); break; } @@ -6283,6 +6312,7 @@ static inline void wq_watchdog_init(void) { } */ void __init workqueue_init_early(void) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -6302,6 +6332,22 @@ void __init workqueue_init_early(void) BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL)); + /* initialize WQ_AFFN_SYSTEM pods */ + pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); + + BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); + + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + pt->nr_pods = 1; + cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); + pt->pod_node[0] = NUMA_NO_NODE; + pt->cpu_pod[0] = 0; + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -6457,8 +6503,8 @@ void __init workqueue_init(void) */ void __init workqueue_init_topology(void) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; struct workqueue_struct *wq; - cpumask_var_t *tbl; int node, cpu; if (num_possible_nodes() <= 1) @@ -6478,20 +6524,23 @@ void __init workqueue_init_topology(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); - BUG_ON(!tbl); + pt->pod_cpus = kcalloc(nr_node_ids, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(nr_node_ids, sizeof(pt->pod_node[0]), GFP_KERNEL); + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[node], GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - cpumask_set_cpu(cpu, tbl[node]); + cpumask_set_cpu(cpu, pt->pod_cpus[node]); + pt->pod_node[node] = node; + pt->cpu_pod[cpu] = node; } - wq_pod_cpus = tbl; - wq_pod_enabled = true; + pt->nr_pods = nr_node_ids; /* * Workqueues allocated earlier would have all CPUs sharing the default From 7f7dc377a3b2bbaa8cf8941587c228eab4bd82ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 21/31] workqueue: Add tools/workqueue/wq_dump.py which prints out workqueue configuration Lack of visibility has always been a pain point for workqueues. While the recently added wq_monitor.py improved the situation, it's still difficult to understand what worker pools are active in the system, how workqueues map to them and why. The lack of visibility into how workqueues are configured is going to become more noticeable as workqueue improves locality awareness and provides more mechanisms to customize locality related behaviors. Now that the basic framework for more flexible locality support is in place, this is a good time to improve the situation. This patch adds tools/workqueues/wq_dump.py which prints out the topology configuration, worker pools and how workqueues are mapped to pools. Read the command's help message for more details. Signed-off-by: Tejun Heo --- Documentation/core-api/workqueue.rst | 59 ++++++++++ tools/workqueue/wq_dump.py | 166 +++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 tools/workqueue/wq_dump.py diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 8e541c5d8fa9..c9e46acd339b 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -347,6 +347,65 @@ Guidelines level of locality in wq operations and work item execution. +Examining Configuration +======================= + +Use tools/workqueue/wq_dump.py to examine unbound CPU affinity +configuration, worker pools and how workqueues map to the pools: :: + + $ tools/workqueue/wq_dump.py + Affinity Scopes + =============== + wq_unbound_cpumask=0000000f + + NUMA + nr_pods 2 + pod_cpus [0]=00000003 [1]=0000000c + pod_node [0]=0 [1]=1 + cpu_pod [0]=0 [1]=0 [2]=1 [3]=1 + + SYSTEM + nr_pods 1 + pod_cpus [0]=0000000f + pod_node [0]=-1 + cpu_pod [0]=0 [1]=0 [2]=0 [3]=0 + + Worker Pools + ============ + pool[00] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 0 + pool[01] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 0 + pool[02] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 1 + pool[03] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 1 + pool[04] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 2 + pool[05] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 2 + pool[06] ref= 1 nice= 0 idle/workers= 3/ 3 cpu= 3 + pool[07] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 3 + pool[08] ref=42 nice= 0 idle/workers= 6/ 6 cpus=0000000f + pool[09] ref=28 nice= 0 idle/workers= 3/ 3 cpus=00000003 + pool[10] ref=28 nice= 0 idle/workers= 17/ 17 cpus=0000000c + pool[11] ref= 1 nice=-20 idle/workers= 1/ 1 cpus=0000000f + pool[12] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=00000003 + pool[13] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=0000000c + + Workqueue CPU -> pool + ===================== + [ workqueue \ CPU 0 1 2 3 dfl] + events percpu 0 2 4 6 + events_highpri percpu 1 3 5 7 + events_long percpu 0 2 4 6 + events_unbound unbound 9 9 10 10 8 + events_freezable percpu 0 2 4 6 + events_power_efficient percpu 0 2 4 6 + events_freezable_power_ percpu 0 2 4 6 + rcu_gp percpu 0 2 4 6 + rcu_par_gp percpu 0 2 4 6 + slub_flushwq percpu 0 2 4 6 + netns ordered 8 8 8 8 8 + ... + +See the command's help message for more info. + + Monitoring ========== diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py new file mode 100644 index 000000000000..ddd0bb4395ea --- /dev/null +++ b/tools/workqueue/wq_dump.py @@ -0,0 +1,166 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2023 Tejun Heo +# Copyright (C) 2023 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current workqueue configuration. For more +info on drgn, visit https://github.com/osandov/drgn. + +Affinity Scopes +=============== + +Shows the CPUs that can be used for unbound workqueues and how they will be +grouped by each available affinity type. For each type: + + nr_pods number of CPU pods in the affinity type + pod_cpus CPUs in each pod + pod_node NUMA node for memory allocation for each pod + cpu_pod pod that each CPU is associated to + +Worker Pools +============ + +Lists all worker pools indexed by their ID. For each pool: + + ref number of pool_workqueue's associated with this pool + nice nice value of the worker threads in the pool + idle number of idle workers + workers number of all workers + cpu CPU the pool is associated with (per-cpu pool) + cpus CPUs the workers in the pool can run on (unbound pool) + +Workqueue CPU -> pool +===================== + +Lists all workqueues along with their type and worker pool association. For +each workqueue: + + NAME TYPE POOL_ID... + + NAME name of the workqueue + TYPE percpu, unbound or ordered + POOL_ID worker pool ID associated with each possible CPU +""" + +import sys + +import drgn +from drgn.helpers.linux.list import list_for_each_entry,list_empty +from drgn.helpers.linux.percpu import per_cpu_ptr +from drgn.helpers.linux.cpumask import for_each_cpu,for_each_possible_cpu +from drgn.helpers.linux.idr import idr_for_each + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +args = parser.parse_args() + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def cpumask_str(cpumask): + output = "" + base = 0 + v = 0 + for cpu in for_each_cpu(cpumask[0]): + while cpu - base >= 32: + output += f'{hex(v)} ' + base += 32 + v = 0 + v |= 1 << (cpu - base) + if v > 0: + output += f'{v:08x}' + return output.strip() + +worker_pool_idr = prog['worker_pool_idr'] +workqueues = prog['workqueues'] +wq_unbound_cpumask = prog['wq_unbound_cpumask'] +wq_pod_types = prog['wq_pod_types'] + +WQ_UNBOUND = prog['WQ_UNBOUND'] +WQ_ORDERED = prog['__WQ_ORDERED'] +WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] + +WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] +WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] + +print('Affinity Scopes') +print('===============') + +print(f'wq_unbound_cpumask={cpumask_str(wq_unbound_cpumask)}') + +def print_pod_type(pt): + print(f' nr_pods {pt.nr_pods.value_()}') + + print(' pod_cpus', end='') + for pod in range(pt.nr_pods): + print(f' [{pod}]={cpumask_str(pt.pod_cpus[pod])}', end='') + print('') + + print(' pod_node', end='') + for pod in range(pt.nr_pods): + print(f' [{pod}]={pt.pod_node[pod].value_()}', end='') + print('') + + print(f' cpu_pod ', end='') + for cpu in for_each_possible_cpu(prog): + print(f' [{cpu}]={pt.cpu_pod[cpu].value_()}', end='') + print('') + +print('') +print('NUMA') +print_pod_type(wq_pod_types[WQ_AFFN_NUMA]) +print('') +print('SYSTEM') +print_pod_type(wq_pod_types[WQ_AFFN_SYSTEM]) + +print('') +print('Worker Pools') +print('============') + +max_pool_id_len = 0 +max_ref_len = 0 +for pi, pool in idr_for_each(worker_pool_idr): + pool = drgn.Object(prog, 'struct worker_pool', address=pool) + max_pool_id_len = max(max_pool_id_len, len(f'{pi}')) + max_ref_len = max(max_ref_len, len(f'{pool.refcnt.value_()}')) + +for pi, pool in idr_for_each(worker_pool_idr): + pool = drgn.Object(prog, 'struct worker_pool', address=pool) + print(f'pool[{pi:0{max_pool_id_len}}] ref={pool.refcnt.value_():{max_ref_len}} nice={pool.attrs.nice.value_():3} ', end='') + print(f'idle/workers={pool.nr_idle.value_():3}/{pool.nr_workers.value_():3} ', end='') + if pool.cpu >= 0: + print(f'cpu={pool.cpu.value_():3}', end='') + else: + print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='') + print('') + +print('') +print('Workqueue CPU -> pool') +print('=====================') + +print('[ workqueue \ CPU ', end='') +for cpu in for_each_possible_cpu(prog): + print(f' {cpu:{max_pool_id_len}}', end='') +print(' dfl]') + +for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): + print(f'{wq.name.string_().decode()[-24:]:24}', end='') + if wq.flags & WQ_UNBOUND: + if wq.flags & WQ_ORDERED: + print(' ordered', end='') + else: + print(' unbound', end='') + else: + print(' percpu ', end='') + + for cpu in for_each_possible_cpu(prog): + pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_() + field_len = max(len(str(cpu)), max_pool_id_len) + print(f' {pool_id:{field_len}}', end='') + + if wq.flags & WQ_UNBOUND: + print(f' {wq.dfl_pwq.pool.id.value_():{max_pool_id_len}}', end='') + print('') From 025e16845877e80cb169274b330c236056ba553c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 22/31] workqueue: Modularize wq_pod_type initialization While wq_pod_type[] can now group CPUs in any aribitrary way, WQ_AFFN_NUM init is hard coded into workqueue_init_topology(). This patch modularizes the init path by introducing init_pod_type() which takes a callback to determine whether two CPUs should share a pod as an argument. init_pod_type() first scans the CPU combinations testing for sharing to assign consecutive pod IDs and initialize pod_type->cpu_pod[]. Once ->cpu_pod[] is determined, ->pod_cpus[] and ->pod_node[] are initialized accordingly. WQ_AFFN_NUMA is now initialized by calling init_pod_type() with cpus_share_numa() which tests whether the CPU belongs to the same NUMA node. This patch may change the pod ID assigned to each NUMA node but that shouldn't cause any behavior changes as the NUMA node to use for allocations are tracked separately in pod_type->pod_node[]. This makes adding new affinty types pretty easy. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 84 +++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6c4d7b1fdf9a..a2cc0432abd5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6494,6 +6494,54 @@ void __init workqueue_init(void) wq_watchdog_init(); } +/* + * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to + * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique + * and consecutive pod ID. The rest of @pt is initialized accordingly. + */ +static void __init init_pod_type(struct wq_pod_type *pt, + bool (*cpus_share_pod)(int, int)) +{ + int cur, pre, cpu, pod; + + pt->nr_pods = 0; + + /* init @pt->cpu_pod[] according to @cpus_share_pod() */ + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->cpu_pod); + + for_each_possible_cpu(cur) { + for_each_possible_cpu(pre) { + if (pre >= cur) { + pt->cpu_pod[cur] = pt->nr_pods++; + break; + } + if (cpus_share_pod(cur, pre)) { + pt->cpu_pod[cur] = pt->cpu_pod[pre]; + break; + } + } + } + + /* init the rest to match @pt->cpu_pod[] */ + pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node); + + for (pod = 0; pod < pt->nr_pods; pod++) + BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); + + for_each_possible_cpu(cpu) { + cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); + pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); + } +} + +static bool __init cpus_share_numa(int cpu0, int cpu1) +{ + return cpu_to_node(cpu0) == cpu_to_node(cpu1); +} + /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * @@ -6503,45 +6551,13 @@ void __init workqueue_init(void) */ void __init workqueue_init_topology(void) { - struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; struct workqueue_struct *wq; - int node, cpu; + int cpu; - if (num_possible_nodes() <= 1) - return; - - for_each_possible_cpu(cpu) { - if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - return; - } - } + init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); mutex_lock(&wq_pool_mutex); - /* - * We want masks of possible CPUs of each node which isn't readily - * available. Build one from cpu_to_node() which should have been - * fully initialized by now. - */ - pt->pod_cpus = kcalloc(nr_node_ids, sizeof(pt->pod_cpus[0]), GFP_KERNEL); - pt->pod_node = kcalloc(nr_node_ids, sizeof(pt->pod_node[0]), GFP_KERNEL); - pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); - BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); - - for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[node], GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE)); - - for_each_possible_cpu(cpu) { - node = cpu_to_node(cpu); - cpumask_set_cpu(cpu, pt->pod_cpus[node]); - pt->pod_node[node] = node; - pt->cpu_pod[cpu] = node; - } - - pt->nr_pods = nr_node_ids; - /* * Workqueues allocated earlier would have all CPUs sharing the default * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU From 63c5484e74952f60f5810256bd69814d167b8d22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: [PATCH 23/31] workqueue: Add multiple affinity scopes and interface to select them Add three more affinity scopes - WQ_AFFN_CPU, SMT and CACHE - and make CACHE the default. The code changes to actually add the additional scopes are trivial. Also add module parameter "workqueue.default_affinity_scope" to override the default scope and "affinity_scope" sysfs file to configure it per workqueue. wq_dump.py and documentations are updated accordingly. This enables significant flexibility in configuring how unbound workqueues behave. If affinity scope is set to "cpu", it'll behave close to a per-cpu workqueue. On the other hand, "system" removes all locality boundaries. Many modern machines have multiple L3 caches often while being mostly uniform in terms of memory access. Thus, workqueue's previous behavior of spreading work items in each NUMA node had negative performance implications from unncessarily crossing L3 boundaries between issue and execution. However, picking a finer grained affinity scope also has a downside in that an issuer in one group can't utilize CPUs in other groups. While dependent on the specifics of workload, there's usually a noticeable penalty in crossing L3 boundaries, so let's default to CACHE. This issue will be further addressed and documented with examples in future patches. Signed-off-by: Tejun Heo --- .../admin-guide/kernel-parameters.txt | 12 ++ Documentation/core-api/workqueue.rst | 63 ++++++++++ include/linux/workqueue.h | 5 +- kernel/workqueue.c | 110 +++++++++++++++++- tools/workqueue/wq_dump.py | 15 ++- 5 files changed, 193 insertions(+), 12 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b89cbc39713..732c5c7e3fa5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7007,6 +7007,18 @@ The default value of this parameter is determined by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + workqueue.default_affinity_scope= + Select the default affinity scope to use for unbound + workqueues. Can be one of "cpu", "smt", "cache", + "numa" and "system". Default is "cache". For more + information, see the Affinity Scopes section in + Documentation/core-api/workqueue.rst. + + This can be updated after boot through the matching + file under /sys/module/workqueue/parameters. + However, the changed default will only apply to + unbound workqueues created afterwards. + workqueue.debug_force_rr_cpu Workqueue used to implicitly guarantee that work items queued without explicit CPU specified are put diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index c9e46acd339b..56af317508c9 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -347,6 +347,51 @@ Guidelines level of locality in wq operations and work item execution. +Affinity Scopes +=============== + +An unbound workqueue groups CPUs according to its affinity scope to improve +cache locality. For example, if a workqueue is using the default affinity +scope of "cache", it will group CPUs according to last level cache +boundaries. A work item queued on the workqueue will be processed by a +worker running on one of the CPUs which share the last level cache with the +issuing CPU. + +Workqueue currently supports the following five affinity scopes. + +``cpu`` + CPUs are not grouped. A work item issued on one CPU is processed by a + worker on the same CPU. This makes unbound workqueues behave as per-cpu + workqueues without concurrency management. + +``smt`` + CPUs are grouped according to SMT boundaries. This usually means that the + logical threads of each physical CPU core are grouped together. + +``cache`` + CPUs are grouped according to cache boundaries. Which specific cache + boundary is used is determined by the arch code. L3 is used in a lot of + cases. This is the default affinity scope. + +``numa`` + CPUs are grouped according to NUMA bounaries. + +``system`` + All CPUs are put in the same group. Workqueue makes no effort to process a + work item on a CPU close to the issuing CPU. + +The default affinity scope can be changed with the module parameter +``workqueue.default_affinity_scope`` and a specific workqueue's affinity +scope can be changed using ``apply_workqueue_attrs()``. + +If ``WQ_SYSFS`` is set, the workqueue will have the following affinity scope +related interface files under its ``/sys/devices/virtual/WQ_NAME/`` +directory. + +``affinity_scope`` + Read to see the current affinity scope. Write to change. + + Examining Configuration ======================= @@ -358,6 +403,24 @@ configuration, worker pools and how workqueues map to the pools: :: =============== wq_unbound_cpumask=0000000f + CPU + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + SMT + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + CACHE (default) + nr_pods 2 + pod_cpus [0]=00000003 [1]=0000000c + pod_node [0]=0 [1]=1 + cpu_pod [0]=0 [1]=0 [2]=1 [3]=1 + NUMA nr_pods 2 pod_cpus [0]=00000003 [1]=0000000c diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 180491ee6706..568cfbc24bc0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -126,12 +126,15 @@ struct rcu_work { }; enum wq_affn_scope { + WQ_AFFN_CPU, /* one pod per CPU */ + WQ_AFFN_SMT, /* one pod poer SMT */ + WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, - WQ_AFFN_DFL = WQ_AFFN_NUMA, + WQ_AFFN_DFL = WQ_AFFN_CACHE, }; /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2cc0432abd5..8e3a499c3f00 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,6 +338,15 @@ struct wq_pod_type { }; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL; + +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; /* * Per-cpu work items which run for longer than the following threshold are @@ -3664,7 +3673,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); - attrs->affn_scope = WQ_AFFN_DFL; + attrs->affn_scope = wq_affn_dfl; return attrs; fail: free_workqueue_attrs(attrs); @@ -5777,19 +5786,55 @@ out_unlock: return ret; } +static int parse_affn_scope(const char *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { + if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) + return i; + } + return -EINVAL; +} + +static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) +{ + int affn; + + affn = parse_affn_scope(val); + if (affn < 0) + return affn; + + wq_affn_dfl = affn; + return 0; +} + +static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); +} + +static const struct kernel_param_ops wq_affn_dfl_ops = { + .set = wq_affn_dfl_set, + .get = wq_affn_dfl_get, +}; + +module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) */ struct wq_device { struct workqueue_struct *wq; @@ -5932,9 +5977,47 @@ out_unlock: return ret ?: count; } +static ssize_t wq_affn_scope_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_affn_scope_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int affn, ret = -ENOMEM; + + affn = parse_affn_scope(buf); + if (affn < 0) + return affn; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_scope = affn; + ret = apply_workqueue_attrs_locked(wq, attrs); + } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR_NULL, }; @@ -6537,6 +6620,20 @@ static void __init init_pod_type(struct wq_pod_type *pt, } } +static bool __init cpus_dont_share(int cpu0, int cpu1) +{ + return false; +} + +static bool __init cpus_share_smt(int cpu0, int cpu1) +{ +#ifdef CONFIG_SCHED_SMT + return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); +#else + return false; +#endif +} + static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); @@ -6554,6 +6651,9 @@ void __init workqueue_init_topology(void) struct workqueue_struct *wq; int cpu; + init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); + init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); mutex_lock(&wq_pool_mutex); diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index ddd0bb4395ea..43ab71a193b8 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -78,11 +78,16 @@ worker_pool_idr = prog['worker_pool_idr'] workqueues = prog['workqueues'] wq_unbound_cpumask = prog['wq_unbound_cpumask'] wq_pod_types = prog['wq_pod_types'] +wq_affn_dfl = prog['wq_affn_dfl'] +wq_affn_names = prog['wq_affn_names'] WQ_UNBOUND = prog['WQ_UNBOUND'] WQ_ORDERED = prog['__WQ_ORDERED'] WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] +WQ_AFFN_CPU = prog['WQ_AFFN_CPU'] +WQ_AFFN_SMT = prog['WQ_AFFN_SMT'] +WQ_AFFN_CACHE = prog['WQ_AFFN_CACHE'] WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] @@ -109,12 +114,10 @@ def print_pod_type(pt): print(f' [{cpu}]={pt.cpu_pod[cpu].value_()}', end='') print('') -print('') -print('NUMA') -print_pod_type(wq_pod_types[WQ_AFFN_NUMA]) -print('') -print('SYSTEM') -print_pod_type(wq_pod_types[WQ_AFFN_SYSTEM]) +for affn in [WQ_AFFN_CPU, WQ_AFFN_SMT, WQ_AFFN_CACHE, WQ_AFFN_NUMA, WQ_AFFN_SYSTEM]: + print('') + print(f'{wq_affn_names[affn].string_().decode().upper()}{" (default)" if affn == wq_affn_dfl else ""}') + print_pod_type(wq_pod_types[affn]) print('') print('Worker Pools') From 873eaca6eaf84b1d1ed5b7259308c6a4fca70fdc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 24/31] workqueue: Factor out work to worker assignment and collision handling The two work execution paths in worker_thread() and rescuer_thread() use move_linked_works() to claim work items from @pool->worklist. Once claimed, process_schedule_works() is called which invokes process_one_work() on each work item. process_one_work() then uses find_worker_executing_work() to detect and handle collisions - situations where the work item to be executed is still running on another worker. This works fine, but, to improve work execution locality, we want to establish work to worker association earlier and know for sure that the worker is going to excute the work once asssigned, which requires performing collision handling earlier while trying to assign the work item to the worker. This patch introduces assign_work() which assigns a work item to a worker using move_linked_works() and then performs collision handling. As collision handling is handled earlier, process_one_work() no longer needs to worry about them. After the this patch, collision checks for linked work items are skipped, which should be fine as they can't be queued multiple times concurrently. For work items running from rescuers, the timing of collision handling may change but the invariant that the work items go through collision handling before starting execution does not. This patch shouldn't cause noticeable behavior changes, especially given that worker_thread() behavior remains the same. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 80 ++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8e3a499c3f00..8d0416981947 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1025,13 +1025,10 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, * @head: target list to append @work to * @nextp: out parameter for nested worklist walking * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). + * Schedule linked works starting from @work to @head. Work series to be + * scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on + * @nextp. * * CONTEXT: * raw_spin_lock_irq(pool->lock). @@ -1060,6 +1057,48 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +/** + * assign_work - assign a work item and its linked work items to a worker + * @work: work to assign + * @worker: worker to assign to + * @nextp: out parameter for nested worklist walking + * + * Assign @work and its linked work items to @worker. If @work is already being + * executed by another worker in the same pool, it'll be punted there. + * + * If @nextp is not NULL, it's updated to point to the next work of the last + * scheduled work. This allows assign_work() to be nested inside + * list_for_each_entry_safe(). + * + * Returns %true if @work was successfully assigned to @worker. %false if @work + * was punted to another worker already executing it. + */ +static bool assign_work(struct work_struct *work, struct worker *worker, + struct work_struct **nextp) +{ + struct worker_pool *pool = worker->pool; + struct worker *collision; + + lockdep_assert_held(&pool->lock); + + /* + * A single work shouldn't be executed concurrently by multiple workers. + * __queue_work() ensures that @work doesn't jump to a different pool + * while still running in the previous pool. Here, we should ensure that + * @work is not executed concurrently by multiple workers from the same + * pool. Check whether anyone is already processing the work. If so, + * defer the work to the currently executing one. + */ + collision = find_worker_executing_work(pool, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, nextp); + return false; + } + + move_linked_works(work, &worker->scheduled, nextp); + return true; +} + /** * wake_up_worker - wake up an idle worker * @pool: worker pool to wake worker from @@ -2462,7 +2501,6 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; - struct worker *collision; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -2479,18 +2517,6 @@ __acquires(&pool->lock) WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); - /* - * A single work shouldn't be executed concurrently by - * multiple workers on a single cpu. Check whether anyone is - * already processing the work. If so, defer the work to the - * currently executing one. - */ - collision = find_worker_executing_work(pool, work); - if (unlikely(collision)) { - move_linked_works(work, &collision->scheduled, NULL); - return; - } - /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); @@ -2717,8 +2743,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - move_linked_works(work, &worker->scheduled, NULL); - process_scheduled_works(worker); + if (assign_work(work, worker, NULL)) + process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2762,7 +2788,6 @@ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; - struct list_head *scheduled = &rescuer->scheduled; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2807,15 +2832,14 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq) { - move_linked_works(work, scheduled, &n); + if (get_work_pwq(work) == pwq && + assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; - } } - if (!list_empty(scheduled)) { + if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* From 0219a3528d72143d8d2c4c793b61541d03518b59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 25/31] workqueue: Factor out need_more_worker() check and worker wake-up Checking need_more_worker() and calling wake_up_worker() is a repeated pattern. Let's add kick_pool(), which checks need_more_worker() and open-code wake_up_worker(), and replace wake_up_worker() uses. The following conversions aren't one-to-one: * __queue_work() was using __need_more_work() because it knows that pool->worklist isn't empty. Switching to kick_pool() adds an extra list_empty() test. * create_worker() always needs to wake up the newly minted worker whether there's more work to do or not to avoid triggering hung task check on the new task. Keep the current wake_up_process() and still add kick_pool(). This may lead to an extra wakeup which isn't harmful. * pwq_adjust_max_active() was explicitly checking whether it needs to wake up a worker or not to avoid spurious wakeups. As kick_pool() only wakes up a worker when necessary, this explicit check is no longer necessary and dropped. * unbind_workers() now calls kick_pool() instead of wake_up_worker() adding a need_more_worker() test. This avoids spurious wakeups and shouldn't break anything. wake_up_worker() is dropped as kick_pool() replaces all its users. After this patch, all paths that wakes up a non-rescuer worker to initiate work item execution use kick_pool(). This will enable future changes to improve locality. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 87 ++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8d0416981947..e941fa052a2b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -815,11 +815,6 @@ static bool work_is_canceling(struct work_struct *work) * they're being called with pool->lock held. */ -static bool __need_more_worker(struct worker_pool *pool) -{ - return !pool->nr_running; -} - /* * Need to wake up a worker? Called from anything but currently * running workers. @@ -830,7 +825,7 @@ static bool __need_more_worker(struct worker_pool *pool) */ static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && __need_more_worker(pool); + return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ @@ -1100,20 +1095,23 @@ static bool assign_work(struct work_struct *work, struct worker *worker, } /** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from + * kick_pool - wake up an idle worker if necessary + * @pool: pool to kick * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). + * @pool may have pending work items. Wake up worker if necessary. Returns + * whether a worker was woken up. */ -static void wake_up_worker(struct worker_pool *pool) +static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); - if (likely(worker)) - wake_up_process(worker->task); + lockdep_assert_held(&pool->lock); + + if (!need_more_worker(pool) || !worker) + return false; + + wake_up_process(worker->task); + return true; } #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT @@ -1281,10 +1279,9 @@ void wq_worker_sleeping(struct task_struct *task) } pool->nr_running--; - if (need_more_worker(pool)) { + if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } + raw_spin_unlock_irq(&pool->lock); } @@ -1332,10 +1329,8 @@ void wq_worker_tick(struct task_struct *task) wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; - if (need_more_worker(pool)) { + if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } raw_spin_unlock(&pool->lock); } @@ -1773,9 +1768,7 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; insert_work(pwq, work, &pool->worklist, work_flags); - - if (__need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; insert_work(pwq, work, &pwq->inactive_works, work_flags); @@ -2181,9 +2174,18 @@ static struct worker *create_worker(struct worker_pool *pool) /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; worker_enter_idle(worker); + kick_pool(pool); + + /* + * @worker is waiting on a completion in kthread() and will trigger hung + * check if not woken up soon. As kick_pool() might not have waken it + * up, wake it up explicitly once more. + */ wake_up_process(worker->task); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -2545,14 +2547,12 @@ __acquires(&pool->lock) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Wake up another worker if necessary. The condition is always - * false for normal per-cpu workers since nr_running would always - * be >= 1 at this point. This is used to chain execution of the - * pending work items for WORKER_NOT_RUNNING workers such as the - * UNBOUND and CPU_INTENSIVE ones. + * Kick @pool if necessary. It's always noop for per-cpu worker pools + * since nr_running would always be >= 1 at this point. This is used to + * chain execution of the pending work items for WORKER_NOT_RUNNING + * workers such as the UNBOUND and CPU_INTENSIVE ones. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last @@ -2872,12 +2872,10 @@ repeat: put_pwq(pwq); /* - * Leave this pool. If need_more_worker() is %true, notify a - * regular worker; otherwise, we end up with 0 concurrency - * and stalling the execution. + * Leave this pool. Notify regular workers; otherwise, we end up + * with 0 concurrency and stalling the execution. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -4111,24 +4109,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { - bool kick = false; - pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < pwq->max_active) { + pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq); - kick = true; - } - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. In realtime scenarios, always kicking a - * worker will cause interference on the isolated cpu cores, so - * let's kick iff work items were activated. - */ - if (kick) - wake_up_worker(pwq->pool); + kick_pool(pwq->pool); } else { pwq->max_active = 0; } @@ -5389,7 +5376,7 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); From 9546b29e4a6ad6ed7924dd7980975c8e675740a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 26/31] workqueue: Add workqueue_attrs->__pod_cpumask workqueue_attrs has two uses: * to specify the required unouned workqueue properties by users * to match worker_pool's properties to workqueues by core code For example, if the user wants to restrict a workqueue to run only CPUs 0 and 2, and the two CPUs are on different affinity scopes, the workqueue's attrs->cpumask would contains CPUs 0 and 2, and the workqueue would be associated with two worker_pools, one with attrs->cpumask containing just CPU 0 and the other CPU 2. Workqueue wants to support non-strict affinity scopes where work items are started in their matching affinity scopes but the scheduler is free to migrate them outside the starting scopes, which can enable utilizing the whole machine while maintaining most of the locality benefits from affinity scopes. To enable that, worker_pools need to distinguish the strict affinity that it has to follow (because that's the restriction coming from the user) and the soft affinity that it wants to apply when dispatching work items. Note that two worker_pools with different soft dispatching requirements have to be separate; otherwise, for example, we'd be ping-ponging worker threads across NUMA boundaries constantly. This patch adds workqueue_attrs->__pod_cpumask. The new field is double underscored as it's only used internally to distinguish worker_pools. A worker_pool's ->cpumask is now always the same as the online subset of allowed CPUs of the associated workqueues, and ->__pod_cpumask is the pod's subset of that ->cpumask. Going back to the example above, both worker_pools would have ->cpumask containing both CPUs 0 and 2 but one's ->__pod_cpumask would contain 0 while the other's 2. * pool_allowed_cpus() is added. It returns the worker_pool's strict cpumask that the pool's workers must stay within. This is currently always ->__pod_cpumask as all boundaries are still strict. * As a workqueue_attrs can now track both the associated workqueues' cpumask and its per-pod subset, wq_calc_pod_cpumask() no longer needs an external out-argument. Drop @cpumask and instead store the result in ->__pod_cpumask. * The above also simplifies apply_wqattrs_prepare() as the same workqueue_attrs can be used to create all pods associated with a workqueue. tmp_attrs is dropped. * wq_update_pod() is updated to use wqattrs_equal() to test whether a pwq update is needed instead of only comparing ->cpumask so that ->__pod_cpumask is compared too. It can directly compare ->__pod_cpumaks but the code is easier to understand and more robust this way. The only user-visible behavior change is that two workqueues with different cpumasks no longer can share worker_pools even when their pod subsets coincide. Going back to the example, let's say there's another workqueue with cpumask 0, 2, 3, where 2 and 3 are in the same pod. It would be mapped to two worker_pools - one with CPU 0, the other with 2 and 3. The former has the same cpumask as the first pod of the earlier example and would have shared the same worker_pool but that's no longer the case after this patch. The worker_pools would have the same ->__pod_cpumask but their ->cpumask's wouldn't match. While this is necessary to support non-strict affinity scopes, there can be further optimizations to maintain sharing among strict affinity scopes. However, non-strict affinity scopes are going to be preferable for most use cases and we don't see very diverse mixture of unbound workqueue cpumasks anyway, so the additional overhead doesn't seem to justify the extra complexity. v2: - wq_update_pod() was incorrectly comparing target_attrs->__pod_cpumask to pool->attrs->cpumask instead of its ->__pod_cpumask. Fix it by using wqattrs_equal() for comparison instead. - Per-cpu worker pools weren't initializing ->__pod_cpumask which caused a subtle problem later on. Set it to cpumask_of(cpu) like ->cpumask. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 +++++++++ kernel/workqueue.c | 74 +++++++++++++++++++-------------------- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 568cfbc24bc0..fe53976e088e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -150,9 +150,25 @@ struct workqueue_attrs { /** * @cpumask: allowed CPUs + * + * Work items in this workqueue are affine to these CPUs and not allowed + * to execute on other CPUs. A pool serving a workqueue must have the + * same @cpumask. */ cpumask_var_t cpumask; + /** + * @__pod_cpumask: internal attribute used to create per-pod pools + * + * Internal use only. + * + * Per-pod unbound worker pools are used to improve locality. Always a + * subset of ->cpumask. A workqueue can be associated with multiple + * worker pools with disjoint @__pod_cpumask's. Whether the enforcement + * of a pool's @__pod_cpumask is strict depends on @affn_strict. + */ + cpumask_var_t __pod_cpumask; + /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e941fa052a2b..e61b4291bec8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -366,7 +366,6 @@ static bool wq_online; /* can kworkers be created yet? */ /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; -static cpumask_var_t wq_update_pod_cpumask_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -2050,6 +2049,11 @@ static struct worker *alloc_worker(int node) return worker; } +static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) +{ + return pool->attrs->__pod_cpumask; +} + /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached @@ -2075,7 +2079,7 @@ static void worker_attach_to_pool(struct worker *worker, kthread_set_per_cpu(worker->task, pool->cpu); if (worker->rescue_wq) - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -2167,7 +2171,7 @@ static struct worker *create_worker(struct worker_pool *pool) } set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool->attrs->cpumask); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); @@ -3672,6 +3676,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); + free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } @@ -3693,6 +3698,8 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; + if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) + goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); attrs->affn_scope = wq_affn_dfl; @@ -3707,6 +3714,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); /* * Unlike hash and equality test, copying shouldn't ignore wq-only @@ -3735,6 +3743,8 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash(cpumask_bits(attrs->__pod_cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -3746,6 +3756,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; + if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) + return false; return true; } @@ -3998,9 +4010,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* If cpumask is contained inside a NUMA pod, that's our NUMA node */ + /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ for (pod = 0; pod < pt->nr_pods; pod++) { - if (cpumask_subset(attrs->cpumask, pt->pod_cpus[pod])) { + if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { node = pt->pod_node[pod]; break; } @@ -4190,11 +4202,10 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * @attrs: the wq_attrs of the default pwq of the target workqueue * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask * * Calculate the cpumask a workqueue with @attrs should use on @pod. If * @cpu_going_down is >= 0, that cpu is considered offline during calculation. - * The result is stored in @cpumask. + * The result is stored in @attrs->__pod_cpumask. * * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled * and @pod has online CPUs requested by @attrs, the returned cpumask is the @@ -4202,27 +4213,27 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static void wq_calc_pod_cpumask(const struct workqueue_attrs *attrs, int cpu, - int cpu_going_down, cpumask_t *cpumask) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, + int cpu_going_down) { const struct wq_pod_type *pt = wqattrs_pod_type(attrs); int pod = pt->cpu_pod[cpu]; /* does @pod have any online CPUs @attrs wants? */ - cpumask_and(cpumask, pt->pod_cpus[pod], attrs->cpumask); - cpumask_and(cpumask, cpumask, cpu_online_mask); + cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); + cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); + cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - if (cpumask_empty(cpumask)) { - cpumask_copy(cpumask, attrs->cpumask); + if (cpumask_empty(attrs->__pod_cpumask)) { + cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); return; } /* yeap, return possible CPUs in @pod that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, pt->pod_cpus[pod]); + cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - if (cpumask_empty(cpumask)) + if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); } @@ -4276,7 +4287,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; - struct workqueue_attrs *new_attrs, *tmp_attrs; + struct workqueue_attrs *new_attrs; int cpu; lockdep_assert_held(&wq_pool_mutex); @@ -4288,8 +4299,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); - tmp_attrs = alloc_workqueue_attrs(); - if (!ctx || !new_attrs || !tmp_attrs) + if (!ctx || !new_attrs) goto out_free; /* @@ -4299,23 +4309,18 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, */ copy_workqueue_attrs(new_attrs, attrs); wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; - /* - * We may create multiple pwqs with differing cpumasks. Make a copy of - * @new_attrs which will be modified and used to obtain pools. - */ - copy_workqueue_attrs(tmp_attrs, new_attrs); - for_each_possible_cpu(cpu) { if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; ctx->pwq_tbl[cpu] = ctx->dfl_pwq; } else { - wq_calc_pod_cpumask(new_attrs, cpu, -1, tmp_attrs->cpumask); - ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); + wq_calc_pod_cpumask(new_attrs, cpu, -1); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); if (!ctx->pwq_tbl[cpu]) goto out_free; } @@ -4324,14 +4329,13 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, /* save the user configured attrs and sanitize it. */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; ctx->wq = wq; - free_workqueue_attrs(tmp_attrs); return ctx; out_free: - free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); return ERR_PTR(-ENOMEM); @@ -4459,7 +4463,6 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; - cpumask_t *cpumask; lockdep_assert_held(&wq_pool_mutex); @@ -4472,20 +4475,18 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu, * CPU hotplug exclusion. */ target_attrs = wq_update_pod_attrs_buf; - cpumask = wq_update_pod_cpumask_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); /* nothing to do if the target cpumask matches the current pwq */ - wq_calc_pod_cpumask(target_attrs, cpu, off_cpu, cpumask); + wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), lockdep_is_held(&wq_pool_mutex)); - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + if (wqattrs_equal(target_attrs, pwq->pool->attrs)) return; /* create a new pwq */ - cpumask_copy(target_attrs->cpumask, cpumask); pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", @@ -5409,7 +5410,7 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); @@ -6424,8 +6425,6 @@ void __init workqueue_init_early(void) wq_update_pod_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_pod_attrs_buf); - BUG_ON(!alloc_cpumask_var(&wq_update_pod_cpumask_buf, GFP_KERNEL)); - /* initialize WQ_AFFN_SYSTEM pods */ pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); @@ -6451,6 +6450,7 @@ void __init workqueue_init_early(void) BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; pool->node = cpu_to_node(cpu); From 8639ecebc9b1796d7074751a350462f5e1c61cd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 27/31] workqueue: Implement non-strict affinity scope for unbound workqueues An unbound workqueue can be served by multiple worker_pools to improve locality. The segmentation is achieved by grouping CPUs into pods. By default, the cache boundaries according to cpus_share_cache() define the CPUs are grouped. Let's a workqueue is allowed to run on all CPUs and the system has two L3 caches. The workqueue would be mapped to two worker_pools each serving one L3 cache domains. While this improves locality, because the pod boundaries are strict, it limits the total bandwidth a given issuer can consume. For example, let's say there is a thread pinned to a CPU issuing enough work items to saturate the whole machine. With the machine segmented into two pods, no matter how many work items it issues, it can only use half of the CPUs on the system. While this limitation has existed for a very long time, it wasn't very pronounced because the affinity grouping used to be always by NUMA nodes. With cache boundaries as the default and support for even finer grained scopes (smt and cpu), it is now an a lot more pressing problem. This patch implements non-strict affinity scope where the pod boundaries aren't enforced strictly. Going back to the previous example, the workqueue would still be mapped to two worker_pools; however, the affinity enforcement would be soft. The workers in both pools would have their cpus_allowed set to the whole machine thus allowing the scheduler to migrate them anywhere on the machine. However, whenever an idle worker is woken up, the workqueue code asks the scheduler to bring back the task within the pod if the worker is outside. ie. work items start executing within its affinity scope but can be migrated outside as the scheduler sees fit. This removes the hard cap on utilization while maintaining the benefits of affinity scopes. After the earlier ->__pod_cpumask changes, the implementation is pretty simple. When non-strict which is the new default: * pool_allowed_cpus() returns @pool->attrs->cpumask instead of ->__pod_cpumask so that the workers are allowed to run on any CPU that the associated workqueues allow. * If the idle worker task's ->wake_cpu is outside the pod, kick_pool() sets the field to a CPU within the pod. This would be the first use of task_struct->wake_cpu outside scheduler proper, so it isn't clear whether this would be acceptable. However, other methods of migrating tasks are significantly more expensive and are likely prohibitively so if we want to do this on every work item. This needs discussion with scheduler folks. There is also a race window where setting ->wake_cpu wouldn't be effective as the target task is still on CPU. However, the window is pretty small and this being a best-effort optimization, it doesn't seem to warrant more complexity at the moment. While the non-strict cache affinity scopes seem to be the best option, the performance picture interacts with the affinity scope and is a bit complicated to fully discuss in this patch, so the behavior is made easily selectable through wqattrs and sysfs and the next patch will add documentation to discuss performance implications. v2: pool->attrs->affn_strict is set to true for per-cpu worker_pools. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Linus Torvalds --- Documentation/core-api/workqueue.rst | 30 ++++++++--- include/linux/workqueue.h | 11 +++++ kernel/workqueue.c | 74 +++++++++++++++++++++++++++- tools/workqueue/wq_dump.py | 16 ++++-- tools/workqueue/wq_monitor.py | 21 +++++--- 5 files changed, 132 insertions(+), 20 deletions(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 56af317508c9..c73a6df6a118 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -353,9 +353,10 @@ Affinity Scopes An unbound workqueue groups CPUs according to its affinity scope to improve cache locality. For example, if a workqueue is using the default affinity scope of "cache", it will group CPUs according to last level cache -boundaries. A work item queued on the workqueue will be processed by a -worker running on one of the CPUs which share the last level cache with the -issuing CPU. +boundaries. A work item queued on the workqueue will be assigned to a worker +on one of the CPUs which share the last level cache with the issuing CPU. +Once started, the worker may or may not be allowed to move outside the scope +depending on the ``affinity_strict`` setting of the scope. Workqueue currently supports the following five affinity scopes. @@ -391,6 +392,21 @@ directory. ``affinity_scope`` Read to see the current affinity scope. Write to change. +``affinity_strict`` + 0 by default indicating that affinity scopes are not strict. When a work + item starts execution, workqueue makes a best-effort attempt to ensure + that the worker is inside its affinity scope, which is called + repatriation. Once started, the scheduler is free to move the worker + anywhere in the system as it sees fit. This enables benefiting from scope + locality while still being able to utilize other CPUs if necessary and + available. + + If set to 1, all workers of the scope are guaranteed always to be in the + scope. This may be useful when crossing affinity scopes has other + implications, for example, in terms of power consumption or workload + isolation. Strict NUMA scope can also be used to match the workqueue + behavior of older kernels. + Examining Configuration ======================= @@ -475,21 +491,21 @@ Monitoring Use tools/workqueue/wq_monitor.py to monitor workqueue operations: :: $ tools/workqueue/wq_monitor.py events - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18545 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38306 0 0.1 - - - - + events_unbound 38306 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29598 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - sock_diag_events 0 0 0.0 0 0 - - - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18548 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38322 0 0.1 - - - - + events_unbound 38322 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29603 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fe53976e088e..0c1cad38f9db 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -169,6 +169,17 @@ struct workqueue_attrs { */ cpumask_var_t __pod_cpumask; + /** + * @affn_strict: affinity scope is strict + * + * If clear, workqueue will make a best-effort attempt at starting the + * worker inside @__pod_cpumask but the scheduler is free to migrate it + * outside. + * + * If set, workers are only allowed to run inside @__pod_cpumask. + */ + bool affn_strict; + /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e61b4291bec8..6f6f4f37ceb3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -211,6 +211,7 @@ enum pool_workqueue_stats { PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -1103,13 +1104,41 @@ static bool assign_work(struct work_struct *work, struct worker *worker, static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); + struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; - wake_up_process(worker->task); + p = worker->task; + +#ifdef CONFIG_SMP + /* + * Idle @worker is about to execute @work and waking up provides an + * opportunity to migrate @worker at a lower cost by setting the task's + * wake_cpu field. Let's see if we want to move @worker to improve + * execution locality. + * + * We're waking the worker that went idle the latest and there's some + * chance that @worker is marked idle but hasn't gone off CPU yet. If + * so, setting the wake_cpu won't do anything. As this is a best-effort + * optimization and the race window is narrow, let's leave as-is for + * now. If this becomes pronounced, we can skip over workers which are + * still on cpu when picking an idle worker. + * + * If @pool has non-strict affinity, @worker might have ended up outside + * its affinity scope. Repatriate. + */ + if (!pool->attrs->affn_strict && + !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { + struct work_struct *work = list_first_entry(&pool->worklist, + struct work_struct, entry); + p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } +#endif + wake_up_process(p); return true; } @@ -2051,7 +2080,10 @@ static struct worker *alloc_worker(int node) static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { - return pool->attrs->__pod_cpumask; + if (pool->cpu < 0 && pool->attrs->affn_strict) + return pool->attrs->__pod_cpumask; + else + return pool->attrs->cpumask; } /** @@ -3715,6 +3747,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); + to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only @@ -3745,6 +3778,7 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); return hash; } @@ -3758,6 +3792,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; + if (a->affn_strict != b->affn_strict) + return false; return true; } @@ -5847,6 +5883,7 @@ module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) + * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; @@ -6026,10 +6063,42 @@ static ssize_t wq_affn_scope_store(struct device *dev, return ret ?: count; } +static ssize_t wq_affinity_strict_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + wq->unbound_attrs->affn_strict); +} + +static ssize_t wq_affinity_strict_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret = -ENOMEM; + + if (sscanf(buf, "%d", &v) != 1) + return -EINVAL; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_strict = (bool)v; + ret = apply_workqueue_attrs_locked(wq, attrs); + } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), + __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; @@ -6452,6 +6521,7 @@ void __init workqueue_init_early(void) cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index 43ab71a193b8..d0df5833f2c1 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -36,10 +36,11 @@ Workqueue CPU -> pool Lists all workqueues along with their type and worker pool association. For each workqueue: - NAME TYPE POOL_ID... + NAME TYPE[,FLAGS] POOL_ID... NAME name of the workqueue TYPE percpu, unbound or ordered + FLAGS S: strict affinity scope POOL_ID worker pool ID associated with each possible CPU """ @@ -138,13 +139,16 @@ for pi, pool in idr_for_each(worker_pool_idr): print(f'cpu={pool.cpu.value_():3}', end='') else: print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='') + print(f' pod_cpus={cpumask_str(pool.attrs.__pod_cpumask)}', end='') + if pool.attrs.affn_strict: + print(' strict', end='') print('') print('') print('Workqueue CPU -> pool') print('=====================') -print('[ workqueue \ CPU ', end='') +print('[ workqueue \ type CPU', end='') for cpu in for_each_possible_cpu(prog): print(f' {cpu:{max_pool_id_len}}', end='') print(' dfl]') @@ -153,11 +157,15 @@ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_( print(f'{wq.name.string_().decode()[-24:]:24}', end='') if wq.flags & WQ_UNBOUND: if wq.flags & WQ_ORDERED: - print(' ordered', end='') + print(' ordered ', end='') else: print(' unbound', end='') + if wq.unbound_attrs.affn_strict: + print(',S ', end='') + else: + print(' ', end='') else: - print(' percpu ', end='') + print(' percpu ', end='') for cpu in for_each_possible_cpu(prog): pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_() diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index 6e258d123e8c..a8856a9c45dc 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -20,8 +20,11 @@ https://github.com/osandov/drgn. and got excluded from concurrency management to avoid stalling other work items. - CMwake The number of concurrency-management wake-ups while executing a - work item of the workqueue. + CMW/RPR For per-cpu workqueues, the number of concurrency-management + wake-ups while executing a work item of the workqueue. For + unbound workqueues, the number of times a worker was repatriated + to its affinity scope after being migrated to an off-scope CPU by + the scheduler. mayday The number of times the rescuer was requested while waiting for new worker creation. @@ -65,6 +68,7 @@ PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed exec PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups +PWQ_STAT_REPATRIATED = prog['PWQ_STAT_REPATRIATED'] # unbound workers brought back into scope PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer PWQ_NR_STATS = prog['PWQ_NR_STATS'] @@ -89,22 +93,25 @@ class WqStats: 'cpu_time' : self.stats[PWQ_STAT_CPU_TIME], 'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE], 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP], + 'repatriated' : self.stats[PWQ_STAT_REPATRIATED], 'mayday' : self.stats[PWQ_STAT_MAYDAY], 'rescued' : self.stats[PWQ_STAT_RESCUED], } def table_header_str(): return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\ - f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}' + f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}' def table_row_str(self): cpu_intensive = '-' - cm_wakeup = '-' + cmw_rpr = '-' mayday = '-' rescued = '-' - if not self.unbound: + if self.unbound: + cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED]); + else: cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE]) - cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP]) + cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP]) if self.mem_reclaim: mayday = str(self.stats[PWQ_STAT_MAYDAY]) @@ -115,7 +122,7 @@ class WqStats: f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \ f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \ f'{cpu_intensive:>7} ' \ - f'{cm_wakeup:>7} ' \ + f'{cmw_rpr:>7} ' \ f'{mayday:>7} ' \ f'{rescued:>7} ' return out.rstrip(':') From 7dbf15c5c05e835d488e0fee49a35b0f23452e45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 28/31] workqueue: Add "Affinity Scopes and Performance" section to documentation With affinity scopes and their strictness setting added, unbound workqueues should now be able to cover wide variety of configurations and use cases. Unfortunately, the performance picture is not entirely straight-forward due to a trade-off between efficiency and work-conservation in some situations necessitating manual configuration. This patch adds "Affinity Scopes and Performance" section to Documentation/core-api/workqueue.rst which illustrates the trade-off with a set of experiments and provides some guidelines. Signed-off-by: Tejun Heo --- Documentation/core-api/workqueue.rst | 184 ++++++++++++++++++++++++++- 1 file changed, 179 insertions(+), 5 deletions(-) diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index c73a6df6a118..4a8e764f41ae 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -1,6 +1,6 @@ -==================================== -Concurrency Managed Workqueue (cmwq) -==================================== +========= +Workqueue +========= :Date: September, 2010 :Author: Tejun Heo @@ -25,8 +25,8 @@ there is no work item left on the workqueue the worker becomes idle. When a new work item gets queued, the worker begins executing again. -Why cmwq? -========= +Why Concurrency Managed Workqueue? +================================== In the original wq implementation, a multi threaded (MT) wq had one worker thread per CPU and a single threaded (ST) wq had one worker @@ -408,6 +408,180 @@ directory. behavior of older kernels. +Affinity Scopes and Performance +=============================== + +It'd be ideal if an unbound workqueue's behavior is optimal for vast +majority of use cases without further tuning. Unfortunately, in the current +kernel, there exists a pronounced trade-off between locality and utilization +necessitating explicit configurations when workqueues are heavily used. + +Higher locality leads to higher efficiency where more work is performed for +the same number of consumed CPU cycles. However, higher locality may also +cause lower overall system utilization if the work items are not spread +enough across the affinity scopes by the issuers. The following performance +testing with dm-crypt clearly illustrates this trade-off. + +The tests are run on a CPU with 12-cores/24-threads split across four L3 +caches (AMD Ryzen 9 3900x). CPU clock boost is turned off for consistency. +``/dev/dm-0`` is a dm-crypt device created on NVME SSD (Samsung 990 PRO) and +opened with ``cryptsetup`` with default settings. + + +Scenario 1: Enough issuers and work spread across the machine +------------------------------------------------------------- + +The command used: :: + + $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k --ioengine=libaio \ + --iodepth=64 --runtime=60 --numjobs=24 --time_based --group_reporting \ + --name=iops-test-job --verify=sha512 + +There are 24 issuers, each issuing 64 IOs concurrently. ``--verify=sha512`` +makes ``fio`` generate and read back the content each time which makes +execution locality matter between the issuer and ``kcryptd``. The followings +are the read bandwidths and CPU utilizations depending on different affinity +scope settings on ``kcryptd`` measured over five runs. Bandwidths are in +MiBps, and CPU util in percents. + +.. list-table:: + :widths: 16 20 20 + :header-rows: 1 + + * - Affinity + - Bandwidth (MiBps) + - CPU util (%) + + * - system + - 1159.40 ±1.34 + - 99.31 ±0.02 + + * - cache + - 1166.40 ±0.89 + - 99.34 ±0.01 + + * - cache (strict) + - 1166.00 ±0.71 + - 99.35 ±0.01 + +With enough issuers spread across the system, there is no downside to +"cache", strict or otherwise. All three configurations saturate the whole +machine but the cache-affine ones outperform by 0.6% thanks to improved +locality. + + +Scenario 2: Fewer issuers, enough work for saturation +----------------------------------------------------- + +The command used: :: + + $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \ + --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=8 \ + --time_based --group_reporting --name=iops-test-job --verify=sha512 + +The only difference from the previous scenario is ``--numjobs=8``. There are +a third of the issuers but is still enough total work to saturate the +system. + +.. list-table:: + :widths: 16 20 20 + :header-rows: 1 + + * - Affinity + - Bandwidth (MiBps) + - CPU util (%) + + * - system + - 1155.40 ±0.89 + - 97.41 ±0.05 + + * - cache + - 1154.40 ±1.14 + - 96.15 ±0.09 + + * - cache (strict) + - 1112.00 ±4.64 + - 93.26 ±0.35 + +This is more than enough work to saturate the system. Both "system" and +"cache" are nearly saturating the machine but not fully. "cache" is using +less CPU but the better efficiency puts it at the same bandwidth as +"system". + +Eight issuers moving around over four L3 cache scope still allow "cache +(strict)" to mostly saturate the machine but the loss of work conservation +is now starting to hurt with 3.7% bandwidth loss. + + +Scenario 3: Even fewer issuers, not enough work to saturate +----------------------------------------------------------- + +The command used: :: + + $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \ + --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=4 \ + --time_based --group_reporting --name=iops-test-job --verify=sha512 + +Again, the only difference is ``--numjobs=4``. With the number of issuers +reduced to four, there now isn't enough work to saturate the whole system +and the bandwidth becomes dependent on completion latencies. + +.. list-table:: + :widths: 16 20 20 + :header-rows: 1 + + * - Affinity + - Bandwidth (MiBps) + - CPU util (%) + + * - system + - 993.60 ±1.82 + - 75.49 ±0.06 + + * - cache + - 973.40 ±1.52 + - 74.90 ±0.07 + + * - cache (strict) + - 828.20 ±4.49 + - 66.84 ±0.29 + +Now, the tradeoff between locality and utilization is clearer. "cache" shows +2% bandwidth loss compared to "system" and "cache (struct)" whopping 20%. + + +Conclusion and Recommendations +------------------------------ + +In the above experiments, the efficiency advantage of the "cache" affinity +scope over "system" is, while consistent and noticeable, small. However, the +impact is dependent on the distances between the scopes and may be more +pronounced in processors with more complex topologies. + +While the loss of work-conservation in certain scenarios hurts, it is a lot +better than "cache (strict)" and maximizing workqueue utilization is +unlikely to be the common case anyway. As such, "cache" is the default +affinity scope for unbound pools. + +* As there is no one option which is great for most cases, workqueue usages + that may consume a significant amount of CPU are recommended to configure + the workqueues using ``apply_workqueue_attrs()`` and/or enable + ``WQ_SYSFS``. + +* An unbound workqueue with strict "cpu" affinity scope behaves the same as + ``WQ_CPU_INTENSIVE`` per-cpu workqueue. There is no real advanage to the + latter and an unbound workqueue provides a lot more flexibility. + +* Affinity scopes are introduced in Linux v6.5. To emulate the previous + behavior, use strict "numa" affinity scope. + +* The loss of work-conservation in non-strict affinity scopes is likely + originating from the scheduler. There is no theoretical reason why the + kernel wouldn't be able to do the right thing and maintain + work-conservation in most cases. As such, it is possible that future + scheduler improvements may make most of these tunables unnecessary. + + Examining Configuration ======================= From 523a301e66afd1ea9856660bcf3cee3a7c84c6dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: [PATCH 29/31] workqueue: Make default affinity_scope dynamically updatable While workqueue.default_affinity_scope is writable, it only affects workqueues which are created afterwards and isn't very useful. Instead, let's introduce explicit "default" scope and update the effective scope dynamically when workqueue.default_affinity_scope is changed. Signed-off-by: Tejun Heo --- .../admin-guide/kernel-parameters.txt | 8 ++-- Documentation/core-api/workqueue.rst | 9 +++- include/linux/workqueue.h | 3 +- kernel/workqueue.c | 45 ++++++++++++++++--- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 732c5c7e3fa5..98ffce100a39 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7014,10 +7014,10 @@ information, see the Affinity Scopes section in Documentation/core-api/workqueue.rst. - This can be updated after boot through the matching - file under /sys/module/workqueue/parameters. - However, the changed default will only apply to - unbound workqueues created afterwards. + This can be changed after boot by writing to the + matching /sys/module/workqueue/parameters file. All + workqueues with the "default" affinity scope will be + updated accordignly. workqueue.debug_force_rr_cpu Workqueue used to implicitly guarantee that work diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 4a8e764f41ae..5d7b01aed1fe 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -358,7 +358,11 @@ on one of the CPUs which share the last level cache with the issuing CPU. Once started, the worker may or may not be allowed to move outside the scope depending on the ``affinity_strict`` setting of the scope. -Workqueue currently supports the following five affinity scopes. +Workqueue currently supports the following affinity scopes. + +``default`` + Use the scope in module parameter ``workqueue.default_affinity_scope`` + which is always set to one of the scopes below. ``cpu`` CPUs are not grouped. A work item issued on one CPU is processed by a @@ -392,6 +396,9 @@ directory. ``affinity_scope`` Read to see the current affinity scope. Write to change. + When default is the current scope, reading this file will also show the + current effective scope in parentheses, for example, ``default (cache)``. + ``affinity_strict`` 0 by default indicating that affinity scopes are not strict. When a work item starts execution, workqueue makes a best-effort attempt to ensure diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0c1cad38f9db..1c1d06804d45 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -126,6 +126,7 @@ struct rcu_work { }; enum wq_affn_scope { + WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod poer SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ @@ -133,8 +134,6 @@ enum wq_affn_scope { WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, - - WQ_AFFN_DFL = WQ_AFFN_CACHE, }; /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6f6f4f37ceb3..789e11e72a4a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -339,9 +339,10 @@ struct wq_pod_type { }; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; -static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", @@ -3734,7 +3735,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); - attrs->affn_scope = wq_affn_dfl; + attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); @@ -3815,7 +3816,18 @@ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, static const struct wq_pod_type * wqattrs_pod_type(const struct workqueue_attrs *attrs) { - struct wq_pod_type *pt = &wq_pod_types[attrs->affn_scope]; + enum wq_affn_scope scope; + struct wq_pod_type *pt; + + /* to synchronize access to wq_affn_dfl */ + lockdep_assert_held(&wq_pool_mutex); + + if (attrs->affn_scope == WQ_AFFN_DFL) + scope = wq_affn_dfl; + else + scope = attrs->affn_scope; + + pt = &wq_pod_types[scope]; if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && likely(pt->nr_pods)) @@ -5847,13 +5859,29 @@ static int parse_affn_scope(const char *val) static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) { - int affn; + struct workqueue_struct *wq; + int affn, cpu; affn = parse_affn_scope(val); if (affn < 0) return affn; + if (affn == WQ_AFFN_DFL) + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); wq_affn_dfl = affn; + + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); + return 0; } @@ -6033,8 +6061,13 @@ static ssize_t wq_affn_scope_show(struct device *dev, int written; mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%s\n", - wq_affn_names[wq->unbound_attrs->affn_scope]); + if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) + written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", + wq_affn_names[WQ_AFFN_DFL], + wq_affn_names[wq_affn_dfl]); + else + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; From b6a46f7263bd8ba0e545d79bd034c412f32b5875 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Tue, 8 Aug 2023 13:03:29 +0100 Subject: [PATCH 30/31] workqueue: Rename rescuer kworker Each CPU-specific and unbound kworker kthread conforms to a particular naming scheme. However, this does not extend to the rescuer kworker. At present, a rescuer kworker is simply named according to its workqueue's name. This can be cryptic. This patch modifies a rescuer to follow the kworker naming scheme. The "R" is indicative of a rescuer and after "-" is its workqueue's name e.g. "kworker/R-ext4-rsv-conver". tj: Use "R" instead of "r" as the prefix to make it more distinctive and consistent with how highpri pools are marked. Signed-off-by: Aaron Tomlin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 789e11e72a4a..d4364d3d8aaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4642,7 +4642,7 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", From fe48ba7daefe75bbbefa2426deddc05f2d530d2d Mon Sep 17 00:00:00 2001 From: Mirsad Goran Todorovac Date: Sat, 26 Aug 2023 16:51:03 +0200 Subject: [PATCH 31/31] workqueue: fix data race with the pwq->stats[] increment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KCSAN has discovered a data race in kernel/workqueue.c:2598: [ 1863.554079] ================================================================== [ 1863.554118] BUG: KCSAN: data-race in process_one_work / process_one_work [ 1863.554142] write to 0xffff963d99d79998 of 8 bytes by task 5394 on cpu 27: [ 1863.554154] process_one_work (kernel/workqueue.c:2598) [ 1863.554166] worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2752) [ 1863.554177] kthread (kernel/kthread.c:389) [ 1863.554186] ret_from_fork (arch/x86/kernel/process.c:145) [ 1863.554197] ret_from_fork_asm (arch/x86/entry/entry_64.S:312) [ 1863.554213] read to 0xffff963d99d79998 of 8 bytes by task 5450 on cpu 12: [ 1863.554224] process_one_work (kernel/workqueue.c:2598) [ 1863.554235] worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2752) [ 1863.554247] kthread (kernel/kthread.c:389) [ 1863.554255] ret_from_fork (arch/x86/kernel/process.c:145) [ 1863.554266] ret_from_fork_asm (arch/x86/entry/entry_64.S:312) [ 1863.554280] value changed: 0x0000000000001766 -> 0x000000000000176a [ 1863.554295] Reported by Kernel Concurrency Sanitizer on: [ 1863.554303] CPU: 12 PID: 5450 Comm: kworker/u64:1 Tainted: G L 6.5.0-rc6+ #44 [ 1863.554314] Hardware name: ASRock X670E PG Lightning/X670E PG Lightning, BIOS 1.21 04/26/2023 [ 1863.554322] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] [ 1863.554941] ================================================================== lockdep_invariant_state(true); → pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); Moving pwq->stats[PWQ_STAT_STARTED]++; before the line raw_spin_unlock_irq(&pool->lock); resolves the data race without performance penalty. KCSAN detected at least one additional data race: [ 157.834751] ================================================================== [ 157.834770] BUG: KCSAN: data-race in process_one_work / process_one_work [ 157.834793] write to 0xffff9934453f77a0 of 8 bytes by task 468 on cpu 29: [ 157.834804] process_one_work (/home/marvin/linux/kernel/linux_torvalds/kernel/workqueue.c:2606) [ 157.834815] worker_thread (/home/marvin/linux/kernel/linux_torvalds/./include/linux/list.h:292 /home/marvin/linux/kernel/linux_torvalds/kernel/workqueue.c:2752) [ 157.834826] kthread (/home/marvin/linux/kernel/linux_torvalds/kernel/kthread.c:389) [ 157.834834] ret_from_fork (/home/marvin/linux/kernel/linux_torvalds/arch/x86/kernel/process.c:145) [ 157.834845] ret_from_fork_asm (/home/marvin/linux/kernel/linux_torvalds/arch/x86/entry/entry_64.S:312) [ 157.834859] read to 0xffff9934453f77a0 of 8 bytes by task 214 on cpu 7: [ 157.834868] process_one_work (/home/marvin/linux/kernel/linux_torvalds/kernel/workqueue.c:2606) [ 157.834879] worker_thread (/home/marvin/linux/kernel/linux_torvalds/./include/linux/list.h:292 /home/marvin/linux/kernel/linux_torvalds/kernel/workqueue.c:2752) [ 157.834890] kthread (/home/marvin/linux/kernel/linux_torvalds/kernel/kthread.c:389) [ 157.834897] ret_from_fork (/home/marvin/linux/kernel/linux_torvalds/arch/x86/kernel/process.c:145) [ 157.834907] ret_from_fork_asm (/home/marvin/linux/kernel/linux_torvalds/arch/x86/entry/entry_64.S:312) [ 157.834920] value changed: 0x000000000000052a -> 0x0000000000000532 [ 157.834933] Reported by Kernel Concurrency Sanitizer on: [ 157.834941] CPU: 7 PID: 214 Comm: kworker/u64:2 Tainted: G L 6.5.0-rc7-kcsan-00169-g81eaf55a60fc #4 [ 157.834951] Hardware name: ASRock X670E PG Lightning/X670E PG Lightning, BIOS 1.21 04/26/2023 [ 157.834958] Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] [ 157.835567] ================================================================== in code: trace_workqueue_execute_end(work, worker->current_func); → pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); which needs to be resolved separately. Fixes: 725e8ec59c56c ("workqueue: Add pwq->stats[] and a monitoring script") Cc: Tejun Heo Suggested-by: Lai Jiangshan Link: https://lore.kernel.org/lkml/20230818194448.29672-1-mirsad.todorovac@alu.unizg.hr/ Signed-off-by: Mirsad Goran Todorovac Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d4364d3d8aaa..c85825e17df8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2599,6 +2599,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); + pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); @@ -2625,7 +2626,6 @@ __acquires(&pool->lock) * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); - pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); /*