Hi folks,
This revision is mostly about getting work out of the timer callback and
into the new idle worker culling work item.
Revisions
=========
v5 -> v6
++++++++
o Rebase onto v6.1-rc7
o Get rid of worker_pool.idle_cull_list; only do the minimal amount of work in
the timer callback (Tejun). A rough sketch of the resulting flow follows the
v4 -> v5 notes below.
o Dropped the too_many_workers() -> nr_workers_to_cull() change
v4 -> v5
++++++++
o Rebase onto v6.1-rc6
o Overall renaming from "reaping" to "cull"
I somehow convinced myself this was more appropriate
o Split the dwork into timer callback + work item (Tejun)
I didn't want to have redundant operations happen in the timer callback and in
the work item, so I made the timer callback detect which workers are "ripe"
enough and then toss them to a worker for removal.
This however means we release pool->lock before actually doing anything to
those idle workers, which means they can wake up in the meantime. The new
worker_pool.idle_cull_list is there for that reason.
The alternative was to have the timer callback detect if any worker was ripe
enough, kick the work item if so, and have the work item do the same thing
again, which I didn't like.
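Putting the v5 split and the v6 simplification together, the resulting flow
looks roughly like the below. This is only a trimmed sketch rather than the
actual patch (see the "Convert the idle_timer to a timer + work_struct" patch
for the real thing), and the idle_cull_work / idle_cull_fn() names are purely
illustrative:

  /*
   * Timer callback: only check whether the oldest idle worker has been idle
   * for longer than IDLE_WORKER_TIMEOUT; if so, punt the actual culling to a
   * work item. Nothing is done to the workers here beyond peeking at idle_list.
   */
  static void idle_worker_timeout(struct timer_list *t)
  {
          struct worker_pool *pool = from_timer(pool, t, idle_timer);
          bool do_cull = false;

          raw_spin_lock_irq(&pool->lock);

          if (too_many_workers(pool)) {
                  struct worker *worker;
                  unsigned long expires;

                  /* idle_list is kept LIFO, the last entry is the oldest */
                  worker = list_last_entry(&pool->idle_list, struct worker, entry);
                  expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                  do_cull = !time_before(jiffies, expires);

                  if (!do_cull)
                          mod_timer(&pool->idle_timer, expires);
          }
          raw_spin_unlock_irq(&pool->lock);

          if (do_cull)
                  queue_work(system_unbound_wq, &pool->idle_cull_work);
  }

  /*
   * Work item: re-check under pool->lock and destroy whichever idle workers
   * are (still) ripe. Runs in a sleepable context, unlike the timer callback.
   */
  static void idle_cull_fn(struct work_struct *work)
  {
          struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);

          raw_spin_lock_irq(&pool->lock);

          while (too_many_workers(pool)) {
                  struct worker *worker;
                  unsigned long expires;

                  worker = list_last_entry(&pool->idle_list, struct worker, entry);
                  expires = worker->last_active + IDLE_WORKER_TIMEOUT;

                  if (time_before(jiffies, expires)) {
                          mod_timer(&pool->idle_timer, expires);
                          break;
                  }

                  destroy_worker(worker);
          }

          raw_spin_unlock_irq(&pool->lock);
  }

IOW the timer callback only decides *whether* to cull (re-arming itself
otherwise), and everything that actually touches the workers happens in the
work item.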
RFCv3 -> v4
+++++++++++
o Rebase onto v6.0
o Split into more patches for reviewability
o Take dying workers out of the pool->workers as suggested by Lai
RFCv2 -> RFCv3
++++++++++++++
o Rebase onto v5.19
o Add new patch (1/3) around accessing wq_unbound_cpumask
o Prevent WORKER_DIE workers from kfree()'ing themselves before the idle reaper
gets to handle them (Tejun)
Bit of an aside on that: I've been struggling to convince myself this can
happen due to spurious wakeups and would like some help here.
Idle workers are TASK_UNINTERRUPTIBLE, so they can't be woken up by
signals. That state is set *under* pool->lock, and all wakeups (before this
patch) are also done while holding pool->lock.
wake_up_worker() is done under pool->lock AND only wakes a worker on the
pool->idle_list. Thus the to-be-woken worker *cannot* have WORKER_DIE, though
it could gain it *after* being woken but *before* it runs, e.g.:
  LOCK pool->lock
  wake_up_worker(pool)
      wake_up_process(p)
  UNLOCK pool->lock
                         idle_reaper_fn()
                           LOCK pool->lock
                           destroy_worker(worker, list);
                           UNLOCK pool->lock
                                                  worker_thread()
                                                    goto woke_up;
                                                    LOCK pool->lock
                                                    READ worker->flags & WORKER_DIE
                                                    UNLOCK pool->lock
                                                    ...
                                                    kfree(worker);
                           reap_worker(worker);
                           // Uh-oh
... But IMO that's not a spurious wakeup, that's a concurrency issue. I don't
see any spurious/unexpected worker wakeup happening once a worker is off the
pool->idle_list.
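For reference, the WORKER_DIE handling at the top of worker_thread() (as of
v6.1, i.e. before this series, trimmed) looks roughly like the below; the
READ + kfree() steps in the diagram above happen here:

  woke_up:
          raw_spin_lock_irq(&pool->lock);

          /* am I supposed to die? */
          if (unlikely(worker->flags & WORKER_DIE)) {
                  raw_spin_unlock_irq(&pool->lock);
                  WARN_ON_ONCE(!list_empty(&worker->entry));
                  set_pf_worker(false);

                  set_task_comm(worker->task, "kworker/dying");
                  ida_free(&pool->worker_ida, worker->id);
                  worker_detach_from_pool(worker);
                  kfree(worker);
                  return 0;
          }

Once WORKER_DIE is observed here the worker frees itself, so anything still
holding a struct worker pointer (like the reaper in the diagram) must not
touch it afterwards.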
RFCv1 -> RFCv2
++++++++++++++
o Change the pool->timer into a delayed_work to have a sleepable context for
unbinding kworkers
Cheers,
Valentin
Lai Jiangshan (1):
workqueue: Protects wq_unbound_cpumask with wq_pool_attach_mutex
Valentin Schneider (3):
workqueue: Factorize unbind/rebind_workers() logic
workqueue: Convert the idle_timer to a timer + work_struct
workqueue: Unbind kworkers before sending them to exit()
kernel/workqueue.c | 195 +++++++++++++++++++++++++++++++++------------
1 file changed, 143 insertions(+), 52 deletions(-)
--
2.31.1
From: Lai Jiangshan <[email protected]>
When unbind_workers() reads wq_unbound_cpumask to set the affinity of
freshly-unbound kworkers, it only holds wq_pool_attach_mutex. This isn't
sufficient as wq_unbound_cpumask is only protected by wq_pool_mutex.
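Schematically, the problematic interleaving looks like this (a simplified
sketch, not an actual trace):

  workqueue_set_unbound_cpumask()             unbind_workers()
    mutex_lock(&wq_pool_mutex);  /* via apply_wqattrs_lock() */
                                                mutex_lock(&wq_pool_attach_mutex);
    cpumask_copy(wq_unbound_cpumask, ...);
                                                /* reads wq_unbound_cpumask while
                                                 * holding only wq_pool_attach_mutex */
                                                cpumask_intersects(wq_unbound_cpumask,
                                                                   cpu_active_mask);
                                                set_cpus_allowed_ptr(worker->task,
                                                                     wq_unbound_cpumask);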
Make wq_unbound_cpumask additionally protected by wq_pool_attach_mutex, and
remove the need for the temporary saved_cpumask.
Fixes: 10a5a651e3af ("workqueue: Restrict kworker in the offline CPU pool running on housekeeping CPUs")
Reported-by: Valentin Schneider <[email protected]>
Signed-off-by: Lai Jiangshan <[email protected]>
---
kernel/workqueue.c | 41 ++++++++++++++++-------------------------
1 file changed, 16 insertions(+), 25 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7cd5f5e7e0a1b..8e21c352c1558 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -326,7 +326,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-/* PL: allowable cpus for unbound wqs and work items */
+/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
/* CPU where unbound work was last round robin scheduled from this CPU */
@@ -3952,7 +3952,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
- const struct workqueue_attrs *attrs)
+ const struct workqueue_attrs *attrs,
+ const cpumask_var_t unbound_cpumask)
{
struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3968,14 +3969,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
/*
- * Calculate the attrs of the default pwq.
+ * Calculate the attrs of the default pwq with unbound_cpumask, which is
+ * either wq_unbound_cpumask or the cpumask about to become wq_unbound_cpumask.
* If the user configured cpumask doesn't overlap with the
* wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
*/
copy_workqueue_attrs(new_attrs, attrs);
- cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
if (unlikely(cpumask_empty(new_attrs->cpumask)))
- cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
+ cpumask_copy(new_attrs->cpumask, unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
@@ -4072,7 +4074,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
wq->flags &= ~__WQ_ORDERED;
}
- ctx = apply_wqattrs_prepare(wq, attrs);
+ ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
if (!ctx)
return -ENOMEM;
@@ -5334,7 +5336,7 @@ void thaw_workqueues(void)
}
#endif /* CONFIG_FREEZER */
-static int workqueue_apply_unbound_cpumask(void)
+static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
LIST_HEAD(ctxs);
int ret = 0;
@@ -5350,7 +5352,7 @@ static int workqueue_apply_unbound_cpumask(void)
if (wq->flags & __WQ_ORDERED)
continue;
- ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+ ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
if (!ctx) {
ret = -ENOMEM;
break;
@@ -5365,6 +5367,11 @@ static int workqueue_apply_unbound_cpumask(void)
apply_wqattrs_cleanup(ctx);
}
+ if (!ret) {
+ mutex_lock(&wq_pool_attach_mutex);
+ cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
+ mutex_unlock(&wq_pool_attach_mutex);
+ }
return ret;
}
@@ -5383,7 +5390,6 @@ static int workqueue_apply_unbound_cpumask(void)
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
int ret = -EINVAL;
- cpumask_var_t saved_cpumask;
/*
* Not excluding isolated cpus on purpose.
@@ -5397,23 +5403,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
goto out_unlock;
}
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- /* save the old wq_unbound_cpumask. */
- cpumask_copy(saved_cpumask, wq_unbound_cpumask);
-
- /* update wq_unbound_cpumask at first and apply it to wqs. */
- cpumask_copy(wq_unbound_cpumask, cpumask);
- ret = workqueue_apply_unbound_cpumask();
-
- /* restore the wq_unbound_cpumask when failed. */
- if (ret < 0)
- cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ ret = workqueue_apply_unbound_cpumask(cpumask);
- free_cpumask_var(saved_cpumask);
out_unlock:
apply_wqattrs_unlock();
}
--
2.31.1
Later patches will reuse this code, so move it into reusable functions.
Signed-off-by: Valentin Schneider <[email protected]>
---
kernel/workqueue.c | 33 +++++++++++++++++++++------------
1 file changed, 21 insertions(+), 12 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8e21c352c1558..8185a42848c50 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1972,6 +1972,23 @@ static struct worker *create_worker(struct worker_pool *pool)
return NULL;
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+static void rebind_worker(struct worker *worker, struct worker_pool *pool)
+{
+ kthread_set_per_cpu(worker->task, pool->cpu);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
+}
+
/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
@@ -5008,13 +5025,8 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
- }
+ for_each_pool_worker(worker, pool)
+ unbind_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5039,11 +5051,8 @@ static void rebind_workers(struct worker_pool *pool)
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
- for_each_pool_worker(worker, pool) {
- kthread_set_per_cpu(worker->task, pool->cpu);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
- pool->attrs->cpumask) < 0);
- }
+ for_each_pool_worker(worker, pool)
+ rebind_worker(worker, pool);
raw_spin_lock_irq(&pool->lock);
--
2.31.1
Hello,
So, this generally looks great to me. Lai, what do you think?
Thanks.
--
tejun
On Thu, Dec 1, 2022 at 5:06 AM Tejun Heo <[email protected]> wrote:
>
> Hello,
>
> So, this generally looks great to me. Lai, what do you think?
>
> Thanks.
>
Hello,
It looks great to me too. (except for a defect in patch4)
Reviewed-by: Lai Jiangshan <[email protected]>
Thanks
Lai