2023-10-10 03:29:17

by lai bin

Subject: [PATCH] sched/wait: introduce endmark in __wake_up_common

Without this patch applied, the waker can fall into an infinite loop in
some cases. Commit 2554db916586 ("sched/wait: Break up long wake list
walk") introduced WQ_FLAG_BOOKMARK to break up long wake list walks:
when the number of walked entries reaches 64, the waker records the
scan position and releases the queue lock, which reduces interrupt and
rescheduling latency.

Let's take an example: the ltp-aiodio case of runltp-ng creates 100
processes for I/O testing. These processes write to the same test file
and wait for page writeback to complete. Because they all write the
same file, they may all be waiting for the writeback bit of the same
page to be cleared. When writeback completes, end_page_writeback()
clears the writeback bit of the page and wakes up every process on the
page's wake list. At the same time, another process can submit the page
for writeback and set the writeback bit again. When the awakened
processes find that the writeback bit is set again, they immediately
add themselves back to the wake list. Because of the WQ_FLAG_BOOKMARK
feature, they are re-added to the tail of the wake list while the waker
has dropped the queue lock, so the waker keeps finding new entries
behind its bookmark and falls into an infinite loop.
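
The waiter side looks roughly like this (a condensed sketch of the
folio_wait_bit() path; wake_page_function(), exclusive waits and signal
handling are left out, and wq_entry is this task's entry on the folio's
waitqueue):

	for (;;) {
		spin_lock_irq(&q->lock);
		if (list_empty(&wq_entry.entry))
			/* re-queue at the tail, i.e. behind the waker's bookmark */
			__add_wait_queue_entry_tail(q, &wq_entry);
		set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&q->lock);

		if (!folio_test_writeback(folio)) {
			__set_current_state(TASK_RUNNING);
			break;		/* the bit really is clear, we are done */
		}

		io_schedule();	/* sleep until folio_wake_bit() wakes us */
		/* the bit may have been set again by now; loop and re-queue */
	}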

Therefore, introduce an endmark to bound the walk. The endmark is
appended to the tail of the list when the first bookmark is placed, so
entries re-queued later land behind it. Once the walk reaches the
endmark entry, stop placing the bookmark and hold the lock until all
remaining entries have been walked.
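
In list terms, after the waker first drops the lock the queue looks
like this (rough picture, W being an ordinary waiter entry):

	head -> W ... W -> BOOKMARK -> W ... W -> ENDMARK

Waiters that re-queue themselves are added at the tail, behind the
endmark. When the resumed walk reaches the endmark it stops placing new
bookmarks and finishes the remaining entries under the lock, so the
walk is guaranteed to terminate.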

Signed-off-by: Bin Lai <[email protected]>

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3413babd2db4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -213,7 +213,8 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark);
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+ wait_queue_entry_t *endmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..9ecb59193710 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -79,10 +79,11 @@ EXPORT_SYMBOL(remove_wait_queue);
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ wait_queue_entry_t *bookmark,
+ wait_queue_entry_t *endmark)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;
+ int cnt = 0, touch_endmark = 0;

lockdep_assert_held(&wq_head->lock);

@@ -95,12 +96,17 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);

if (&curr->entry == &wq_head->head)
- return nr_exclusive;
+ goto out;

list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
int ret;

+ if (curr == endmark) {
+ touch_endmark = 1;
+ continue;
+ }
+
if (flags & WQ_FLAG_BOOKMARK)
continue;

@@ -110,14 +116,24 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;

- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ if (bookmark && !touch_endmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
(&next->entry != &wq_head->head)) {
bookmark->flags = WQ_FLAG_BOOKMARK;
list_add_tail(&bookmark->entry, &next->entry);
- break;
+
+ if (endmark && !(endmark->flags & WQ_FLAG_BOOKMARK)) {
+ endmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&endmark->entry, &wq_head->head);
+ }
+
+ return nr_exclusive;
}
}

+out:
+ if (endmark && (endmark->flags & WQ_FLAG_BOOKMARK))
+ list_del(&endmark->entry);
+
return nr_exclusive;
}

@@ -125,7 +141,7 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
+ wait_queue_entry_t bookmark, endmark;
int remaining = nr_exclusive;

bookmark.flags = 0;
@@ -133,10 +149,15 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);

+ endmark.flags = 0;
+ endmark.private = NULL;
+ endmark.func = NULL;
+ INIT_LIST_HEAD(&endmark.entry);
+
do {
spin_lock_irqsave(&wq_head->lock, flags);
remaining = __wake_up_common(wq_head, mode, remaining,
- wake_flags, key, &bookmark);
+ wake_flags, key, &bookmark, &endmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
} while (bookmark.flags & WQ_FLAG_BOOKMARK);

@@ -171,20 +192,21 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);

void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);

void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+ wait_queue_entry_t *endmark)
{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+ __wake_up_common(wq_head, mode, 1, 0, key, bookmark, endmark);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);

@@ -233,7 +255,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);

diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea4387053e8..49dc8620271d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1135,7 +1135,7 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
- wait_queue_entry_t bookmark;
+ wait_queue_entry_t bookmark, endmark;

key.folio = folio;
key.bit_nr = bit_nr;
@@ -1146,8 +1146,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);

+ endmark.flags = 0;
+ endmark.private = NULL;
+ endmark.func = NULL;
+ INIT_LIST_HEAD(&endmark.entry);
+
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);

while (bookmark.flags & WQ_FLAG_BOOKMARK) {
/*
@@ -1159,7 +1164,7 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
spin_unlock_irqrestore(&q->lock, flags);
cpu_relax();
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);
}

/*
--
2.39.3


2023-10-10 03:58:43

by Matthew Wilcox

Subject: Re: [PATCH] sched/wait: introduce endmark in __wake_up_common

On Tue, Oct 10, 2023 at 11:28:33AM +0800, Bin Lai wrote:
> Without this patch applied, the waker can fall into an infinite loop in
> some cases. Commit 2554db916586 ("sched/wait: Break up long wake list
> walk") introduced WQ_FLAG_BOOKMARK to break up long wake list walks:
> when the number of walked entries reaches 64, the waker records the
> scan position and releases the queue lock, which reduces interrupt and
> rescheduling latency.

Maybe we could try removing bookmarks instead? There was a thread a
while ago where we agreed they weren't actually useful any more.

Patches to follow ...

2023-10-10 03:59:10

by Matthew Wilcox

Subject: [PATCH 1/2] filemap: Remove use of wait bookmarks

The original problem of the overly long list of waiters on a
locked page was solved properly by commit 9a1ea439b16b ("mm:
put_and_wait_on_page_locked() while page is migrated"). In
the meantime, using bookmarks for the writeback bit can cause
livelocks, so we need to stop using them.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
mm/filemap.c | 21 +--------------------
1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index f0a15ce1bd1b..1f9adad2d781 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1135,32 +1135,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
- wait_queue_entry_t bookmark;

key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;

- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
-
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
-
- while (bookmark.flags & WQ_FLAG_BOOKMARK) {
- /*
- * Take a breather from holding the lock,
- * allow pages that finish wake up asynchronously
- * to acquire the lock and remove themselves
- * from wait queue
- */
- spin_unlock_irqrestore(&q->lock, flags);
- cpu_relax();
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
- }
+ __wake_up_locked_key(q, TASK_NORMAL, &key);

/*
* It's possible to miss clearing waiters here, when we woke our page
--
2.40.1

2023-10-10 03:59:14

by Matthew Wilcox

Subject: [PATCH 2/2] sched: Remove wait bookmarks

There are no users of wait bookmarks left, so simplify the wait
code by removing them.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
include/linux/wait.h | 9 +++----
kernel/sched/wait.c | 60 ++++++++------------------------------------
2 files changed, 13 insertions(+), 56 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
-#define WQ_FLAG_BOOKMARK 0x04
-#define WQ_FLAG_CUSTOM 0x08
-#define WQ_FLAG_DONE 0x10
-#define WQ_FLAG_PRIORITY 0x20
+#define WQ_FLAG_CUSTOM 0x04
+#define WQ_FLAG_DONE 0x08
+#define WQ_FLAG_PRIORITY 0x10

/*
* A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -57,13 +57,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
}
EXPORT_SYMBOL(remove_wait_queue);

-/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
@@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue);
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ int nr_exclusive, int wake_flags, void *key)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;

lockdep_assert_held(&wq_head->lock);

- if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
- curr = list_next_entry(bookmark, entry);
-
- list_del(&bookmark->entry);
- bookmark->flags = 0;
- } else
- curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);

if (&curr->entry == &wq_head->head)
return nr_exclusive;
@@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
unsigned flags = curr->flags;
int ret;

- if (flags & WQ_FLAG_BOOKMARK)
- continue;
-
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
-
- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
- (&next->entry != &wq_head->head)) {
- bookmark->flags = WQ_FLAG_BOOKMARK;
- list_add_tail(&bookmark->entry, &next->entry);
- break;
- }
}

return nr_exclusive;
@@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
- int remaining = nr_exclusive;
+ int remaining;

- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
-
- do {
- spin_lock_irqsave(&wq_head->lock, flags);
- remaining = __wake_up_common(wq_head, mode, remaining,
- wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);

return nr_exclusive - remaining;
}
@@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);

void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);

-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);

--
2.40.1

2023-10-10 07:04:29

by lai bin

Subject: Re: [PATCH] sched/wait: introduce endmark in __wake_up_common

Yes, you are right. Since bookmarks aren't actually useful any more,
removing them is the better way to address the livelock.

Matthew Wilcox <[email protected]> 于2023年10月10日周二 11:58写道:
>
> On Tue, Oct 10, 2023 at 11:28:33AM +0800, Bin Lai wrote:
> > Without this patch applied, the waker can fall into an infinite loop in
> > some cases. Commit 2554db916586 ("sched/wait: Break up long wake list
> > walk") introduced WQ_FLAG_BOOKMARK to break up long wake list walks:
> > when the number of walked entries reaches 64, the waker records the
> > scan position and releases the queue lock, which reduces interrupt and
> > rescheduling latency.
>
> Maybe we could try removing bookmarks instead? There was a thread a
> while ago where we agreed they weren't actually useful any more.
>
> Patches to follow ...

2023-10-10 07:13:30

by Ingo Molnar

Subject: Re: [PATCH 2/2] sched: Remove wait bookmarks


* Matthew Wilcox (Oracle) <[email protected]> wrote:

> There are no users of wait bookmarks left, so simplify the wait
> code by removing them.
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
>
> include/linux/wait.h | 9 +++----
> kernel/sched/wait.c | 60 ++++++++------------------------------------
> 2 files changed, 13 insertions(+), 56 deletions(-)

Acked-by: Ingo Molnar <[email protected]>

Thanks,

Ingo