Utilize set_bit() and test_bit() on worker->flags within io_uring/io-wq
to address potential data races.
The io_worker->flags field may be accessed through parallel data
paths, leading to concurrency issues. When KCSAN is enabled, it reveals
data races occurring in io_worker_handle_work and
io_wq_activate_free_worker functions.
BUG: KCSAN: data-race in io_worker_handle_work / io_wq_activate_free_worker
write to 0xffff8885c4246404 of 4 bytes by task 49071 on cpu 28:
io_worker_handle_work (io_uring/io-wq.c:434 io_uring/io-wq.c:569)
io_wq_worker (io_uring/io-wq.c:?)
<snip>
read to 0xffff8885c4246404 of 4 bytes by task 49024 on cpu 5:
io_wq_activate_free_worker (io_uring/io-wq.c:? io_uring/io-wq.c:285)
io_wq_enqueue (io_uring/io-wq.c:947)
io_queue_iowq (io_uring/io_uring.c:524)
io_req_task_submit (io_uring/io_uring.c:1511)
io_handle_tw_list (io_uring/io_uring.c:1198)
Line numbers against commit 18daea77cca6 ("Merge tag 'for-linus' of
git://git.kernel.org/pub/scm/virt/kvm/kvm").
These races involve writes and reads to the same memory location by
different tasks running on different CPUs. To mitigate this, refactor
the code to use atomic operations such as set_bit(), test_bit(), and
clear_bit() instead of basic "and" and "or" operations. This ensures
thread-safe manipulation of worker flags.
Signed-off-by: Breno Leitao <[email protected]>
---
io_uring/io-wq.c | 37 +++++++++++++++++++------------------
1 file changed, 19 insertions(+), 18 deletions(-)
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 522196dfb0ff..6712d70d1f18 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -44,7 +44,7 @@ enum {
*/
struct io_worker {
refcount_t ref;
- unsigned flags;
+ unsigned long flags;
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
@@ -165,7 +165,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
{
- return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND);
+ return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags));
}
static void io_worker_ref_put(struct io_wq *wq)
@@ -225,7 +225,7 @@ static void io_worker_exit(struct io_worker *worker)
wait_for_completion(&worker->ref_done);
raw_spin_lock(&wq->lock);
- if (worker->flags & IO_WORKER_F_FREE)
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags))
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
raw_spin_unlock(&wq->lock);
@@ -410,7 +410,7 @@ static void io_wq_dec_running(struct io_worker *worker)
struct io_wq_acct *acct = io_wq_get_acct(worker);
struct io_wq *wq = worker->wq;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
if (!atomic_dec_and_test(&acct->nr_running))
@@ -430,8 +430,8 @@ static void io_wq_dec_running(struct io_worker *worker)
*/
static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
{
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
+ if (test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ clear_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_lock(&wq->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
raw_spin_unlock(&wq->lock);
@@ -444,8 +444,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
__must_hold(wq->lock)
{
- if (!(worker->flags & IO_WORKER_F_FREE)) {
- worker->flags |= IO_WORKER_F_FREE;
+ if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) {
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
}
}
@@ -631,7 +631,8 @@ static int io_wq_worker(void *data)
bool exit_mask = false, last_timeout = false;
char buf[TASK_COMM_LEN];
- worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ set_bit(IO_WORKER_F_UP, &worker->flags);
+ set_bit(IO_WORKER_F_RUNNING, &worker->flags);
snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
set_task_comm(current, buf);
@@ -695,11 +696,11 @@ void io_wq_worker_running(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (worker->flags & IO_WORKER_F_RUNNING)
+ if (test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags |= IO_WORKER_F_RUNNING;
+ set_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_inc_running(worker);
}
@@ -713,12 +714,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
if (!worker)
return;
- if (!(worker->flags & IO_WORKER_F_UP))
+ if (!test_bit(IO_WORKER_F_UP, &worker->flags))
return;
- if (!(worker->flags & IO_WORKER_F_RUNNING))
+ if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags))
return;
- worker->flags &= ~IO_WORKER_F_RUNNING;
+ clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
io_wq_dec_running(worker);
}
@@ -732,7 +733,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
raw_spin_lock(&wq->lock);
hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
list_add_tail_rcu(&worker->all_list, &wq->all_list);
- worker->flags |= IO_WORKER_F_FREE;
+ set_bit(IO_WORKER_F_FREE, &worker->flags);
raw_spin_unlock(&wq->lock);
wake_up_new_task(tsk);
}
@@ -838,7 +839,7 @@ static bool create_io_worker(struct io_wq *wq, int index)
init_completion(&worker->ref_done);
if (index == IO_WQ_ACCT_BOUND)
- worker->flags |= IO_WORKER_F_BOUND;
+ set_bit(IO_WORKER_F_BOUND, &worker->flags);
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
@@ -924,8 +925,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
struct io_wq_acct *acct = io_work_get_acct(wq, work);
+ unsigned long work_flags = work->flags;
struct io_cb_cancel_data match;
- unsigned work_flags = work->flags;
bool do_create;
/*
--
2.43.0
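For background on the technique the patch applies, a minimal sketch of the difference (illustrative only; set_bit()/clear_bit() are the real helpers from <linux/bitops.h>, while IO_WORKER_F_FREE_BIT is a made-up stand-in for whatever bit offset the flag ends up with):

	/*
	 * Plain RMW: compiles to a load, a modify, and a store. Two CPUs
	 * doing this concurrently on the same word can lose one of the
	 * updates -- the race KCSAN reports above.
	 */
	worker->flags &= ~IO_WORKER_F_FREE;

	/*
	 * Atomic bitop: a single atomic RMW on the containing word, so a
	 * concurrent update to another bit cannot be lost. Note that the
	 * first argument is a bit number, not a mask.
	 */
	clear_bit(IO_WORKER_F_FREE_BIT, &worker->flags);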
On 5/3/24 11:37 AM, Breno Leitao wrote:
> Utilize set_bit() and test_bit() on worker->flags within io_uring/io-wq
> to address potential data races.
>
> The io_worker->flags field may be accessed through parallel data
> paths, leading to concurrency issues. When KCSAN is enabled, it reveals
> data races occurring in io_worker_handle_work and
> io_wq_activate_free_worker functions.
>
> BUG: KCSAN: data-race in io_worker_handle_work / io_wq_activate_free_worker
> write to 0xffff8885c4246404 of 4 bytes by task 49071 on cpu 28:
> io_worker_handle_work (io_uring/io-wq.c:434 io_uring/io-wq.c:569)
> io_wq_worker (io_uring/io-wq.c:?)
> <snip>
>
> read to 0xffff8885c4246404 of 4 bytes by task 49024 on cpu 5:
> io_wq_activate_free_worker (io_uring/io-wq.c:? io_uring/io-wq.c:285)
> io_wq_enqueue (io_uring/io-wq.c:947)
> io_queue_iowq (io_uring/io_uring.c:524)
> io_req_task_submit (io_uring/io_uring.c:1511)
> io_handle_tw_list (io_uring/io_uring.c:1198)
>
> Line numbers against commit 18daea77cca6 ("Merge tag 'for-linus' of
> git://git.kernel.org/pub/scm/virt/kvm/kvm").
>
> These races involve writes and reads to the same memory location by
> different tasks running on different CPUs. To mitigate this, refactor
> the code to use atomic operations such as set_bit(), test_bit(), and
> clear_bit() instead of basic "and" and "or" operations. This ensures
> thread-safe manipulation of worker flags.
Looks good, a few comments for v2:
> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
> index 522196dfb0ff..6712d70d1f18 100644
> --- a/io_uring/io-wq.c
> +++ b/io_uring/io-wq.c
> @@ -44,7 +44,7 @@ enum {
> */
> struct io_worker {
> refcount_t ref;
> - unsigned flags;
> + unsigned long flags;
> struct hlist_nulls_node nulls_node;
> struct list_head all_list;
> struct task_struct *task;
This now creates a hole in the struct, maybe move 'lock' up after ref so
that it gets filled and the current hole after 'lock' gets removed as
well?
And then I'd renumber the flags; they take bit offsets, not
masks/values. Otherwise it's a bit confusing for someone reading the
code, using masks with the test/set bit functions.
--
Jens Axboe
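To make the renumbering point concrete, a sketch of the two conventions (the values assume the pre-patch mask-style enum; treat them as illustrative):

	/* mask-style, for use with &, |, ~: */
	IO_WORKER_F_FREE = 4,		/* i.e. BIT(2) */

	/* bit-offset style, for use with test_bit()/set_bit(): */
	IO_WORKER_F_FREE = 2,

	/*
	 * With the mask value, test_bit(IO_WORKER_F_FREE, &worker->flags)
	 * tests bit 4 rather than bit 2 -- the wrong bit, and misleading
	 * to anyone reading the code.
	 */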
On 5/3/24 11:37 AM, Breno Leitao wrote:
> @@ -631,7 +631,8 @@ static int io_wq_worker(void *data)
> bool exit_mask = false, last_timeout = false;
> char buf[TASK_COMM_LEN];
>
> - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
> + set_bit(IO_WORKER_F_UP, &worker->flags);
> + set_bit(IO_WORKER_F_RUNNING, &worker->flags);
You could probably just use WRITE_ONCE() here with the mask, as it's
setup side.
--
Jens Axboe
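A sketch of that suggestion (assuming the pre-patch mask-style values, and that flags is still zero from allocation, so a plain store of the full mask is safe on the setup side before the worker is visible to anyone else):

	/* setup side: no other task can observe worker->flags yet */
	WRITE_ONCE(worker->flags, IO_WORKER_F_UP | IO_WORKER_F_RUNNING);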
On 03/05/2024 at 20:41, Jens Axboe wrote:
> On 5/3/24 11:37 AM, Breno Leitao wrote:
>> @@ -631,7 +631,8 @@ static int io_wq_worker(void *data)
>> bool exit_mask = false, last_timeout = false;
>> char buf[TASK_COMM_LEN];
>>
>> - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
>> + set_bit(IO_WORKER_F_UP, &worker->flags);
>> + set_bit(IO_WORKER_F_RUNNING, &worker->flags);
>
> You could probably just use WRITE_ONCE() here with the mask, as it's
> setup side.
>
Or simply:
set_mask_bits(&worker->flags, 0, IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
CJ
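For reference, set_mask_bits(ptr, mask, bits) from <linux/bitops.h> atomically clears `mask` and sets `bits` in one cmpxchg loop, conceptually:

	/* the real macro retries the cmpxchg until it wins the race */
	old = READ_ONCE(*ptr);
	new = (old & ~mask) | bits;

so the call above ORs both flag bits into worker->flags in a single atomic update.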
On 5/3/24 1:24 PM, Christophe JAILLET wrote:
> On 03/05/2024 at 20:41, Jens Axboe wrote:
>> On 5/3/24 11:37 AM, Breno Leitao wrote:
>>> @@ -631,7 +631,8 @@ static int io_wq_worker(void *data)
>>> bool exit_mask = false, last_timeout = false;
>>> char buf[TASK_COMM_LEN];
>>> - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
>>> + set_bit(IO_WORKER_F_UP, &worker->flags);
>>> + set_bit(IO_WORKER_F_RUNNING, &worker->flags);
>>
>> You could probably just use WRITE_ONCE() here with the mask, as it's
>> setup side.
>>
>
> Or simply:
> set_mask_bits(&worker->flags, 0, IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
Looks like overkill, as we don't really need that kind of assurance
here. WRITE_ONCE should be fine. Not that it _really_ matters, as it's
not a performance-critical part, but it also sends the wrong hints to the
reader of the code about which kind of guarantees are needed here.
--
Jens Axboe
Hello Jens,
On Fri, May 03, 2024 at 12:41:38PM -0600, Jens Axboe wrote:
> On 5/3/24 11:37 AM, Breno Leitao wrote:
> > @@ -631,7 +631,8 @@ static int io_wq_worker(void *data)
> > bool exit_mask = false, last_timeout = false;
> > char buf[TASK_COMM_LEN];
> >
> > - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
> > + set_bit(IO_WORKER_F_UP, &worker->flags);
> > + set_bit(IO_WORKER_F_RUNNING, &worker->flags);
>
> You could probably just use WRITE_ONCE() here with the mask, as it's
> setup side.
Nice, I didn't know we could mix and match regular operations and
set_bit(). Digging a bit further, I found that this is indeed possible.
Thanks for the feedback.
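A sketch of the mixing in question (illustrative; the rule is that plain accesses are fine only while the worker isn't yet visible to any other task):

	/* allocation/setup path: no concurrent readers or writers yet */
	worker->flags = 0;

	/*
	 * once the worker is published, only atomic bitops (or
	 * READ_ONCE()/WRITE_ONCE()) touch the word
	 */
	set_bit(IO_WORKER_F_FREE, &worker->flags);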
On Fri, May 03, 2024 at 12:32:38PM -0600, Jens Axboe wrote:
> On 5/3/24 11:37 AM, Breno Leitao wrote:
> > Utilize set_bit() and test_bit() on worker->flags within io_uring/io-wq
> > to address potential data races.
> >
> > The io_worker->flags field may be accessed through parallel data
> > paths, leading to concurrency issues. When KCSAN is enabled, it reveals
> > data races occurring in io_worker_handle_work and
> > io_wq_activate_free_worker functions.
> >
> > BUG: KCSAN: data-race in io_worker_handle_work / io_wq_activate_free_worker
> > write to 0xffff8885c4246404 of 4 bytes by task 49071 on cpu 28:
> > io_worker_handle_work (io_uring/io-wq.c:434 io_uring/io-wq.c:569)
> > io_wq_worker (io_uring/io-wq.c:?)
> > <snip>
> >
> > read to 0xffff8885c4246404 of 4 bytes by task 49024 on cpu 5:
> > io_wq_activate_free_worker (io_uring/io-wq.c:? io_uring/io-wq.c:285)
> > io_wq_enqueue (io_uring/io-wq.c:947)
> > io_queue_iowq (io_uring/io_uring.c:524)
> > io_req_task_submit (io_uring/io_uring.c:1511)
> > io_handle_tw_list (io_uring/io_uring.c:1198)
> >
> > Line numbers against commit 18daea77cca6 ("Merge tag 'for-linus' of
> > git://git.kernel.org/pub/scm/virt/kvm/kvm").
> >
> > These races involve writes and reads to the same memory location by
> > different tasks running on different CPUs. To mitigate this, refactor
> > the code to use atomic operations such as set_bit(), test_bit(), and
> > clear_bit() instead of basic "and" and "or" operations. This ensures
> > thread-safe manipulation of worker flags.
>
> Looks good, a few comments for v2:
>
> > diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
> > index 522196dfb0ff..6712d70d1f18 100644
> > --- a/io_uring/io-wq.c
> > +++ b/io_uring/io-wq.c
> > @@ -44,7 +44,7 @@ enum {
> > */
> > struct io_worker {
> > refcount_t ref;
> > - unsigned flags;
> > + unsigned long flags;
> > struct hlist_nulls_node nulls_node;
> > struct list_head all_list;
> > struct task_struct *task;
>
> This now creates a hole in the struct, maybe move 'lock' up after ref so
> that it gets filled and the current hole after 'lock' gets removed as
> well?
I am not sure I can see it. From my tests, we got the same hole, and the
struct size is the same. This is what I got with the change:
struct io_worker {
refcount_t ref; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
raw_spinlock_t lock; /* 8 64 */
/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
<snip>
/* size: 336, cachelines: 6, members: 14 */
/* sum members: 328, holes: 2, sum holes: 8 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 4 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));
This is what this current patch returns:
struct io_worker {
refcount_t ref; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
long unsigned int flags; /* 8 8 */
<snip>
/* size: 336, cachelines: 6, members: 14 */
/* sum members: 328, holes: 2, sum holes: 8 */
/* forced alignments: 2, forced holes: 1, sum forced holes: 4 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));
A possible suggestion is to move `create_index` up after `ref`. Then we
can get a more tightly packed structure:
struct io_worker {
refcount_t ref; /* 0 4 */
int create_index; /* 4 4 */
long unsigned int flags; /* 8 8 */
struct hlist_nulls_node nulls_node; /* 16 16 */
struct list_head all_list; /* 32 16 */
struct task_struct * task; /* 48 8 */
struct io_wq * wq; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct io_wq_work * cur_work; /* 64 8 */
struct io_wq_work * next_work; /* 72 8 */
raw_spinlock_t lock; /* 80 64 */
/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
struct completion ref_done; /* 144 88 */
/* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
long unsigned int create_state; /* 232 8 */
struct callback_head create_work __attribute__((__aligned__(8))); /* 240 16 */
/* --- cacheline 4 boundary (256 bytes) --- */
union {
struct callback_head rcu __attribute__((__aligned__(8))); /* 256 16 */
struct work_struct work; /* 256 72 */
} __attribute__((__aligned__(8))); /* 256 72 */
/* size: 328, cachelines: 6, members: 14 */
/* forced alignments: 2 */
/* last cacheline: 8 bytes */
} __attribute__((__aligned__(8)));
How does that sound?
> And then I'd renumber the flags, they take bit offsets, not
> masks/values. Otherwise it's a bit confusing for someone reading the
> code, using masks with test/set bit functions.
Good point. What about something like this?
enum {
IO_WORKER_F_UP = 0, /* up and active */
IO_WORKER_F_RUNNING = 1, /* account as running */
IO_WORKER_F_FREE = 2, /* worker on free list */
IO_WORKER_F_BOUND = 3, /* is doing bounded work */
};
Since we are now using WRITE_ONCE() in io_wq_worker, I am wondering if
this is what we want to do?
WRITE_ONCE(worker->flags, (IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1);
Thanks
On Tue, May 07, 2024 at 03:44:54AM -0700, Breno Leitao wrote:
> Since we are now using WRITE_ONCE() in io_wq_worker, I am wondering if
> this is what we want to do?
>
> WRITE_ONCE(worker->flags, (IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1);
In fact, we can't clear the other flags here, so a more correct approach would be:
WRITE_ONCE(worker->flags, READ_ONCE(worker->flags) | (IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1);
Does it sound reasonable?
Thanks
On 5/7/24 5:02 AM, Breno Leitao wrote:
> On Tue, May 07, 2024 at 03:44:54AM -0700, Breno Leitao wrote:
>> Since we are now using WRITE_ONCE() in io_wq_worker, I am wondering if
>> this is what we want to do?
>>
>> WRITE_ONCE(worker->flags, (IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1);
>
> In fact, we can't clear the other flags here, so a more correct approach would be:
>
> WRITE_ONCE(worker->flags, READ_ONCE(worker->flags) | (IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1);
>
> Does it sound reasonable?
Either that, or since we aren't just assigning the startup bits, maybe
just use set_mask_bits() like Christophe suggested and not worry about
it?
--
Jens Axboe
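Putting the review points together, a sketch of where v2 seems to be headed (an assumption, not the final patch: with bit-offset flags, masks have to be built with BIT(), so the "(IO_WORKER_F_UP | IO_WORKER_F_RUNNING) << 1" variant above would compute 2 rather than the intended 3):

	enum {
		IO_WORKER_F_UP		= 0,	/* up and active */
		IO_WORKER_F_RUNNING	= 1,	/* account as running */
		IO_WORKER_F_FREE	= 2,	/* worker on free list */
		IO_WORKER_F_BOUND	= 3,	/* is doing bounded work */
	};

	/* setup side in io_wq_worker(): set both bits in one atomic update */
	set_mask_bits(&worker->flags, 0,
		      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));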