We encountered workloads that have very long wake up lists on large
systems. A waker can take a long time to traverse the entire wake list
and execute all the wake functions.
We saw page wait lists that are up to 3700+ entries long in tests of
large 4- and 8-socket systems. It took 0.8 sec to traverse such a list
during wakeup. Any other CPU that contends for the list spin lock will
spin for a long time. This is a result of the NUMA balancing migration
of hot pages that are shared by many threads.
Multiple waking CPUs are queued up behind the lock, and the last one
queued has to wait until all the CPUs ahead of it have finished their
wakeups.
The page wait list is traversed with interrupts disabled, which caused
various problems. This was the original cause that triggered the NMI
watchdog timer in https://patchwork.kernel.org/patch/9800303/ . There,
only extending the NMI watchdog timeout helped.
This patch bookmarks the waker's scan position in the wake list and
breaks up the wakeup walk, to allow access to the list before the waker
resumes its walk down the rest of the wait list. It lowers the interrupt
and rescheduling latency.
This patch also provides a performance boost when combined with the
next patch, which breaks up the page wakeup list walk. We saw a 22%
improvement in the will-it-scale file pread2 test on a Xeon Phi system
running 256 threads.
Thanks.
Tim
v2:
Merged in Linus' changes to remove the bookmark_wake_function, and
simplify access to flags.
Reported-by: Kan Liang <[email protected]>
Tested-by: Kan Liang <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/wait.h | 1 +
kernel/sched/wait.c | 74 ++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 61 insertions(+), 14 deletions(-)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5b74e36..80034e8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
+#define WQ_FLAG_BOOKMARK 0x04
/*
* A single wait-queue entry structure:
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6..789dc24 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
}
EXPORT_SYMBOL(remove_wait_queue);
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,17 +69,64 @@ EXPORT_SYMBOL(remove_wait_queue);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key,
+ wait_queue_entry_t *bookmark)
{
wait_queue_entry_t *curr, *next;
+ int cnt = 0;
+
+ if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+ curr = list_next_entry(bookmark, entry);
- list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+ list_del(&bookmark->entry);
+ bookmark->flags = 0;
+ } else
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
+
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
+ if (flags & WQ_FLAG_BOOKMARK)
+ continue;
+
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
+
+ if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+ (&next->entry != &wq_head->head)) {
+ bookmark->flags = WQ_FLAG_BOOKMARK;
+ list_add_tail(&bookmark->entry, &next->entry);
+ break;
+ }
+ }
+ return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ unsigned long flags;
+ wait_queue_entry_t bookmark;
+
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ wake_flags, key, &bookmark);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
@@ -90,11 +143,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
-
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
@@ -103,13 +152,13 @@ EXPORT_SYMBOL(__wake_up);
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -133,7 +182,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, void *key)
{
- unsigned long flags;
int wake_flags = 1; /* XXX WF_SYNC */
if (unlikely(!wq_head))
@@ -142,9 +190,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
if (unlikely(nr_exclusive != 1))
wake_flags = 0;
- spin_lock_irqsave(&wq_head->lock, flags);
- __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&wq_head->lock, flags);
+ __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
--
2.9.4
Now that we have added breaks in the wait queue scan and allow
bookmarking of the scan position, we put this logic in the
wake_up_page_bit function.
We can have very long page wait lists in large systems where multiple
pages share the same wait list. We break up the wakeup walk here to
give other CPUs a chance to access the list, and to avoid keeping
interrupts disabled while traversing the list for too long. This
reduces the interrupt and rescheduling latency, and excessive page wait
queue lock hold time.
We have to add logic to detect new arrivals so that the wait bit on the
page is cleared only when there are no new waiters for the page. The
breaks in the wait list walk open windows for new arrivals to join the
wait list during the wakeups. They could be added at the head or tail
of the wait queue, depending on whether they are exclusive, in
prepare_to_wait_event. So we can't clear the PageWaiters flag if there
are new arrivals during the wakeup process. Otherwise we would skip
wake_up_page when there are still entries to be woken up.
v2:
Remove bookmark_wake_function
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/wait.h | 7 +++++++
kernel/sched/wait.c | 7 +++++++
mm/filemap.c | 36 ++++++++++++++++++++++++++++++++++--
3 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 80034e8..b926960 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,6 +19,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
#define WQ_FLAG_BOOKMARK 0x04
+#define WQ_FLAG_ARRIVALS 0x08
/*
* A single wait-queue entry structure:
@@ -32,6 +33,8 @@ struct wait_queue_entry {
struct wait_queue_head {
spinlock_t lock;
+ unsigned int waker;
+ unsigned int flags;
struct list_head head;
};
typedef struct wait_queue_head wait_queue_head_t;
@@ -52,6 +55,8 @@ struct task_struct;
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .waker = 0, \
+ .flags = 0, \
.head = { &(name).head, &(name).head } }
#define DECLARE_WAIT_QUEUE_HEAD(name) \
@@ -185,6 +190,8 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 789dc24..81e7e55 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -162,6 +162,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+ __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
diff --git a/mm/filemap.c b/mm/filemap.c
index a497024..a6c7917 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -920,13 +920,41 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
unsigned long flags;
+ wait_queue_entry_t bookmark;
key.page = page;
key.bit_nr = bit_nr;
key.page_match = 0;
+ bookmark.flags = 0;
+ bookmark.private = NULL;
+ bookmark.func = NULL;
+ INIT_LIST_HEAD(&bookmark.entry);
+
+ spin_lock_irqsave(&q->lock, flags);
+ /* q->flags will be set to WQ_FLAG_ARRIVALS if items added to wait queue */
+ if (!q->waker)
+ q->flags &= ~WQ_FLAG_ARRIVALS;
+ ++ q->waker;
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ if (!(bookmark.flags & WQ_FLAG_BOOKMARK))
+ goto finish;
+ /*
+ * Take a breather from holding the lock,
+ * allow pages that finish wake up asynchronously
+ * to acquire the lock and remove themselves
+ * from wait queue
+ */
+ spin_unlock_irqrestore(&q->lock, flags);
+
+again:
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key(q, TASK_NORMAL, &key);
+ __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ if (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ spin_unlock_irqrestore(&q->lock, flags);
+ goto again;
+ }
+finish:
/*
* It is possible for other pages to have collided on the waitqueue
* hash, so in that case check for a page match. That prevents a long-
@@ -936,7 +964,8 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
* and removed them from the waitqueue, but there are still other
* page waiters.
*/
- if (!waitqueue_active(q) || !key.page_match) {
+ if (!waitqueue_active(q) ||
+ (!key.page_match && (q->waker == 1) && !(q->flags & WQ_FLAG_ARRIVALS))) {
ClearPageWaiters(page);
/*
* It's possible to miss clearing Waiters here, when we woke
@@ -946,6 +975,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
* That's okay, it's a rare case. The next waker will clear it.
*/
}
+ -- q->waker;
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -976,6 +1006,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
__add_wait_queue_entry_tail_exclusive(q, wait);
else
__add_wait_queue(q, wait);
+ q->flags = WQ_FLAG_ARRIVALS;
SetPageWaiters(page);
}
@@ -1041,6 +1072,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, waiter);
SetPageWaiters(page);
+ q->flags = WQ_FLAG_ARRIVALS;
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
--
2.9.4
On Fri, 25 Aug 2017, Tim Chen wrote:
> for a long time. It is a result of the numa balancing migration of hot
> pages that are shared by many threads.
I think that would also call for some work to limit NUMA balancing of
hot shared pages. The cache lines of hot pages are likely present in
the low level processor caches anyway, so moving them would not bring a
performance benefit. Limiting the migration there could stop wasting a
lot of effort.
On Fri, Aug 25, 2017 at 9:13 AM, Tim Chen <[email protected]> wrote:
> Now that we have added breaks in the wait queue scan and allow bookmark
> on scan position, we put this logic in the wake_up_page_bit function.
Oh, _this_ is the other patch you were talking about. I thought it was
the NUMA counter threshold that was discussed around the same time,
and that's why I referred to Mel.
Gods, _this_ patch is ugly. No, I'm not happy with it at all. It
makes that wait_queue_head much bigger, for this disgusting one use.
So no, this is no good.
Now, maybe the page_wait_table[] itself could be changed to be
something that is *not* just the wait-queue head.
But if we need to change the page_wait_table[] itself to have more
information, then we should make it not be a wait-queue at all, we
should make it be a list of much more specialized entries, indexed by
the {page,bit} tuple.
And once we do that, we probably *could* use something like two
specialized lists: one that is wake-all, and one that is wake-one.
So you'd have something like
struct page_wait_struct {
struct list_node list;
struct page *page;
int bit;
struct llist_head all;
struct llist_head exclusive;
};
and then the "page_wait_table[]" would be just an array of
struct page_wait_head {
spinlock_t lock;
struct hlist_head list;
};
and then the rule is:
- each page/bit combination allocates one of these page_wait_struct
entries when somebody starts waiting for it for the first time (and we
use the spinlock in the page_wait_head to serialize that list).
- an exclusive wait (ie somebody who waits to get the bit for
locking) adds itself to the 'exclusive' llist
- non-locking waiters add themselves to the 'all' list
- we can use llist_del_all() to remove the 'all' list and then walk
it and wake them up without any locks
- we can use llist_del_first() to remove the first exclusive waiter
and wake _it_ up without any locks.
Hmm? How does that sound? That means that we wouldn't use the normal
wait-queues at all for the page hash waiting. We'd use this two-level
list: one list to find the page/bit thing, and then two lists within
that for the wait-queues for just *that* page/bit.
So no need for the 'key' stuff at all, because the page/bit data would
be in the first data list, and the second list wouldn't have any of
these traversal issues where you have to be careful and do it one
entry at a time.
Linus
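For illustration only, here is a minimal sketch of the wake side of the
two-level scheme described above. The struct page_waiter entry type and
the wake_page_waiters() helper are invented for this sketch and are not
part of any existing kernel interface; the waiter side would still have
to keep its on-stack entry alive until it is actually woken.
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/mm_types.h>
#include <linux/sched.h>

struct page_wait_struct {
	struct hlist_node node;		/* hashed by the {page, bit} tuple */
	struct page *page;
	int bit;
	struct llist_head all;		/* non-exclusive waiters */
	struct llist_head exclusive;	/* would-be lock holders */
};

struct page_waiter {
	struct llist_node lnode;
	struct task_struct *task;
};

/* Wake all non-exclusive waiters and at most one exclusive waiter. */
static void wake_page_waiters(struct page_wait_struct *w)
{
	struct llist_node *node;

	/* Detach the whole 'all' list in one atomic op, then walk it. */
	node = llist_del_all(&w->all);
	while (node) {
		struct llist_node *next = node->next;

		/* Read 'next' first: the entry may vanish once woken. */
		wake_up_process(llist_entry(node, struct page_waiter, lnode)->task);
		node = next;
	}

	/* Hand the bit to exactly one exclusive waiter, if any. */
	node = llist_del_first(&w->exclusive);
	if (node)
		wake_up_process(llist_entry(node, struct page_waiter, lnode)->task);
}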
On 08/25/2017 12:58 PM, Linus Torvalds wrote:
> On Fri, Aug 25, 2017 at 9:13 AM, Tim Chen <[email protected]> wrote:
>> Now that we have added breaks in the wait queue scan and allow bookmark
>> on scan position, we put this logic in the wake_up_page_bit function.
>
> Oh, _this_ is the other patch you were talking about. I thought it was
> the NUMA counter threshold that was discussed around the same time,
> and that's why I referred to Mel.
>
> Gods, _this_ patch is ugly. No, I'm not happy with it at all. It
> makes that wait_queue_head much bigger, for this disgusting one use.
>
> So no, this is no good.
>
> Now, maybe the page_wait_table[] itself could be changed to be
> something that is *not* just the wait-queue head.
>
> But if we need to change the page_wait_table[] itself to have more
> information, then we should make it not be a wait-queue at all, we
> should make it be a list of much more specialized entries, indexed by
> the {page,bit} tuple.
>
> And once we do that, we probably *could* use something like two
> specialized lists: one that is wake-all, and one that is wake-one.
>
> So you'd have something like
>
> struct page_wait_struct {
> struct list_node list;
> struct page *page;
> int bit;
> struct llist_head all;
> struct llist_head exclusive;
> };
>
> and then the "page_wait_table[]" would be just an array of
>
> struct page_wait_head {
> spinlock_t lock;
> struct hlist_head list;
> };
>
> and then the rule is:
>
> - each page/bit combination allocates one of these page_wait_struct
> entries when somebody starts waiting for it for the first time (and we
> use the spinlock in the page_wait_head to serialize that list).
>
> - an exclusive wait (ie somebody who waits to get the bit for
> locking) adds itself to the 'exclusive' llist
>
> - non-locking waiters add themselves to the 'all' list
>
> - we can use llist_del_all() to remove the 'all' list and then walk
> it and wake them up without any locks
>
> - we can use llist_del_first() to remove the first exclusive waiter
> and wake _it_ up without any locks.
>
> Hmm? How does that sound? That means that we wouldn't use the normal
> wait-queues at all for the page hash waiting. We'd use this two-level
> list: one list to find the page/bit thing, and then two lists within
> that for the wait-queues for just *that* page/bit.
>
> So no need for the 'key' stuff at all, because the page/bit data would
> be in the first data list, and the second list wouldn't have any of
> these traversal issues where you have to be careful and do it one
> entry at a time.
If I put the waker count and flags on the wait_page_queue structure,
would that be more acceptable?
Also I think patch 1 is still a good idea as a fail-safe mechanism
in case there are other long wait lists.
That said, I do think your suggested approach is cleaner. However, it
is a much more substantial change. Let me take a look and see if I
have any issues implementing it.
Tim
>
> Linus
>
On Fri, Aug 25, 2017 at 3:19 PM, Tim Chen <[email protected]> wrote:
>
> Also I think patch 1 is still a good idea for a fail safe mechanism
> in case there are other long wait list.
Yeah, I don't hate patch #1.
But that other patch is just nasty.
> That said, I do think your suggested approach is cleaner. However, it
> is a much more substantial change. Let me take a look and see if I
> have any issues implementing it.
Actually, I tried it myself.
It was painful. But I actually have a TOTALLY UNTESTED set of two
patches that implements the idea.
And by "implements the idea" I mean "it kind of compiles, and it kind
of looks like it might work".
But by "kind of compiles" I mean that I didn't implement the nasty
add_page_wait_queue() thing that the cachefiles interface wants.
Honestly, I think the sanest way to do that is to just have a hashed
wait queue *just* for cachefiles.
And by "kind of looks like it might work" I really mean just that.
It's entirely untested. It's more of a "let's take that description of
mine and turn it into code". I really have not tested this AT ALL.
And it's subtle enough that I suspect it really is majorly buggy. It
uses the locking hash list code (hlist_bl_node) to keep the hash table
fairly small and hide the lock in the hash table itself.
And then it plays horrible games with linked lists. Yeah, that may
have been a mistake, but I thought I should try to avoid the doubly
linked lists in that "struct page_wait_struct" because it's allocated
on the stack, and each list_head is 16 bytes on 64-bit architectures.
But that "let's save 24 bytes in the structure" made it much nastier
to remove entries, so it was probably a bad trade-off.
But I'm attaching the two patches because I have no shame. If somebody
is willing to look at my completely untested crap code.
I *really* want to emphasize that "untested crap".
This is meant to be example code of the *concept* rather than anything
that actually works.
So take it as that: example pseudo-code that happens to pass a
compiler, but is meant as a RFD rather than actually working.
The first patch just moves code around because I wanted to experiment
with the new code in a new file. That first patch is probably fine. It
shouldn't change any code, just move it.
The second patch is the "broken patch to illustrate the idea".
Linus
On Fri, Aug 25, 2017 at 3:51 PM, Linus Torvalds
<[email protected]> wrote:
>
> So take it as that: example pseudo-code that happens to pass a
> compiler, but is meant as a RFD rather than actually working.
Oh, and after I sent it out, I wanted to look once again, and realized
that the "remove_myself_from()" function is entirely broken.
The caller has already removed the page entry, we don't want to remove
it again, we want to add a *new* one with us removed from it.
So here's an updated 2/2 patch with that fixed.
Let this be a lesson in just *how* little tested, and *how* crap that
patch probably still is.
Linus
On Fri, Aug 25, 2017 at 4:03 PM, Linus Torvalds
<[email protected]> wrote:
>
> Let this be a lesson in just *how* little tested, and *how* crap that
> patch probably still is.
I haven't had time to look at it any more (trying to merge the pull
requests that came in today instead), but the more I think about it,
the more I think it was a mistake to do that page_wait_struct
allocation on the stack.
It made it way more fragile and complicated, having to rewrite things
so carefully. A simple slab cache would likely be a lot cleaner and
simpler.
So even if that thing can be made to work, it's probably not worth the pain.
Linus
On Fri, Aug 25, 2017 at 5:31 PM, Linus Torvalds
<[email protected]> wrote:
>
> It made it way more fragile and complicated, having to rewrite things
> so carefully. A simple slab cache would likely be a lot cleaner and
> simpler.
It also turns out that despite all the interfaces, we only really ever
wait on two different bits: PG_locked and PG_writeback. Nothing else.
Even the add_page_wait_queue() thing, which looks oh-so-generic,
really only waits on PG_locked.
And the PG_writeback case never really cares for the "locked" case, so
this incredibly generic interface that allows you to wait on any bit
you want, and has the whole exclusive wait support for getting
exclusive access to the bit really only has three cases:
- wait for locked exclusive (wake up first waiter when unlocked)
- wait for locked (wake up all waiters when unlocked)
- wait for writeback (wake up all waiters when no longer under writeback)
and those last two could probably even share the same queue.
But even without sharing the same queue, we could just do a per-page
allocation for the three queues - and probably that stupid
add_page_wait_queue() waitqueue too. So no "per-page and per-bit"
thing, just a per-page thing.
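For illustration, such a per-page thing might look roughly like the
layout below. This is a hypothetical sketch with invented field names,
not a real or proposed interface:
#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/wait.h>

/* One entry per page (not per page/bit), hashed by the page pointer. */
struct page_wait {
	struct hlist_node hash_node;		/* hash chain, by page */
	struct page *page;
	wait_queue_head_t lock_exclusive;	/* lockers: wake one on unlock_page() */
	wait_queue_head_t lock_shared;		/* lock watchers: wake all on unlock_page() */
	wait_queue_head_t writeback;		/* wake all at end_page_writeback() */
};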
I'll try writing that up.
Simplify, simplify, simplify.
Linus
On Fri, Aug 25, 2017 at 7:54 PM, Linus Torvalds
<[email protected]> wrote:
>
> Simplify, simplify, simplify.
I've now tried three different approaches (I stopped sending broken
patches after deciding the first one was unfixable), and they all
suck.
I was hoping for something lockless to avoid the whole issue with
latency over the long list, but anything that has the wait queue entry
allocated on the stack needs to synchronize the wakeup due to the
stack usage, so once you have lists that are thousands of entries,
either you hold the lock for long times (normal wait queues) or take
and release them constantly (the swait approach), or you batch things
up (Tim's wait-queue patches).
Of those three approaches, the batching does seem to be the best of
the lot. Allocating non-stack wait entries while waiting for pages
seems like a bad idea. We're probably waiting for IO in normal
circumstances, possibly because we're low on memory.
So I *am* ok with the batching (your patch #1), but I'm *not* ok with
the nasty horrible book-keeping to avoid new entries when you batch
and release the lock and that allows new entries on the list (your
patch #2).
That said, I have now stared *way* too much at the page wait code
after having unsuccessfully tried to replace that wait-queue with
other "clever" approaches (all of which ended up being crap).
And I'm starting to see a possible solution, or at least improvement.
Let's just assume I take the batching patch. It's not conceptually
ugly, it's fairly simple, and there are lots of independent arguments
for it (ie latency, but also possible performance from better
parallelism).
That patch doesn't make any data structures bigger or more
complicated, it's just that single "is this a bookmark entry" thing.
So that patch just looks fundamentally fine to me, and the only real
argument I ever had against it was that I would really really _really_
have wanted to root-cause the behavior.
So that leaves my fundamental dislike of your other patch.
And it turns out that all my looking at the page wait code wasn't
entirely unproductive. Yes, I went through three crap approaches
before I gave up on rewriting it, but in the meantime I did get way
too intimate with looking at that pile of crud.
And I think I found the reason why you needed that patch #2 in the first place.
Here's what is going on:
- we're going to assume that the problem is all with a single page,
not due to hash collisions (as per your earlier reports)
- we also know that the only bit that matters is the PG_locked bit
- that means that the only way to get that "multiple concurrent
waker" situation that your patch #2 tries to handle better is because
you have multiple people unlocking the page - since that is what
causes the wakeups
- that in turn means that you obviously had multiple threads *locking* it too.
- and that means that among those new entries there are lockers
coming in in the middle and adding an exclusive entry.
- the exclusive entry already stops the wakeup list walking
- but we add non-exclusive entries TO THE BEGINNING of the page waiters list
And I really think that last thing is fundamentally wrong.
It's wrong for several reasons:
- because it's unfair: threads that want to lock get put behind
threads that just want to see the unlocked state.
- because it's stupid: our non-locking waiters will end up waiting
again if the page got locked, so waking up a locker *and* a lot of
non-locking waiters will just cause them to go back to sleep anyway
- because it causes us to walk longer lists: we stop walking when we
wake up the exclusive waiter, but because we always put the
non-exclusive waiters in there first, we always walk the long list of
non-exclusive waiters even if we could just stop walking because we
woke up an exclusive one.
Now the reason we do this seems to be entirely random: for a *normal*
wait queue, you really want to always wake up all the non-exclusive
waiters, because exclusive waiters are only exclusive wrt each other.
But for a page lock, an exclusive waiter really is getting the lock,
and the other waiters are going to wait for the lock to clear anyway,
so the page wait thing is actually almost exactly the reverse
situation. We *could* put exclusive waiters at the beginning of the
list instead, and actively try to avoid walking the list at all if we
have pending lockers.
I'm not doing that, because I think the fair thing to do is to just do
things in the order they came in. Plus the code is actually simpler if
we just always add to the tail.
Now, the other thing to look at is how the wakeup function works. It
checks the aliasing information (page and bit number), but then it
*also* does:
if (test_bit(key->bit_nr, &key->page->flags))
return 0;
basically saying "continue walking if somebody else already got the bit".
That's *INSANE*. It's exactly the wrong thing to do. It's basically
saying "even if we had an exclusive waiter, let's not wake it up, but
do let us continue to walk the list even though the bit we're waiting
for to clear is set already".
What would be sane is to say "stop walking the list now".. So just do
that - by making "negative return from wake function" mean "stop
walking".
So how about just this fairly trivial patch?
In fact, I think this may help even *without* Tim's patch #1. So I
think this would be lovely to test on that problem load on its own,
and seeing if it makes the wait queues behave better.
It might not cut down on the total length of the wait-queue, but it
should hopefully cause it to walk it much less. We now hit the
exclusive waiters earlier and stop waking things up when we have a new
locker thread pending. And when the page ends up being locked again,
we stop walking the list entirely, so no unnecessary traversal.
This patch is small and at least minimally works (I'm running it right now).
Linus
On Sat, Aug 26, 2017 at 11:15 AM, Linus Torvalds
<[email protected]> wrote:
>
> So how about just this fairly trivial patch?
So I just committed that trivial patch, because I think it's right,
but more importantly because I think I found a real and non-trivial
fundamental problem.
The reason I found it is actually that I was thinking about this
patch, and how the WQ_FLAG_EXCLUSIVE ordering matters.
And I don't really think the WQ_FLAG_EXCLUSIVE ordering matters all
that much, but just *thinking* about it made me realize that the code
is broken.
In particular, this caller:
int __lock_page_killable(struct page *__page)
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
is completely broken crap.
Why?
It's the combination of "TASK_KILLABLE" and "true" that is broken.
Always has been broken, afaik.
The "true" is that "bool lock" argument, and when it is set, we set
the WQ_FLAG_EXCLUSIVE bit.
But that bit - by definition, that's the whole point - means that the
waking side only wakes up *one* waiter.
So there's a race in anybody who uses __lock_page_killable().
The race goes like this:
thread1 thread2 thread3
---- ---- ----
.. CPU1 ...
__lock_page_killable
wait_on_page_bit_common()
get wq lock
__add_wait_queue_entry_tail_exclusive()
set_current_state(TASK_KILLABLE);
release wq lock
io_schedule
... CPU2 ...
__lock_page[_killable]()
wait_on_page_bit_common()
get wq lock
__add_wait_queue_entry_tail_exclusive()
set_current_state(TASK_KILLABLE);
release wq lock
io_schedule
... CPU3 ...
unlock_page()
wake_up_page_bit(page, PG_Locked)
wakes up CPU1 _only_
... lethal signal for thread1 happens ...
if (unlikely(signal_pending_state(state, current))) {
ret = -EINTR;
break;
}
End result: page is unlocked, CPU3 is waiting, nothing will wake CPU3 up.
Of course, if we have multiple threads all locking that page
concurrently, we probably will have *another* thread lock it
(successfully), and then when it unlocks it thread3 does get woken up
eventually.
But the keyword is "eventually". It could be a long while,
particularly if we don't lock the page *all* the time, just
occasionally.
So it might be a while, and it might explain how some waiters might queue up.
And who does __lock_page_killable? Page faults.
And who does a lot of page faults and page locking? That NUMA load from hell.
Does it have lethal signals, though? Probably not. That lethal signal
case really is unusual.
So I'm not saying that this is actually THE BUG. In particular,
despite that race, the page actually *is* unlocked afterwards. It's
just that one of the threads that wanted the lock didn't get notified
of it. So it doesn't really explain how non-locking waiters (ie the
people who don't do migrations, just wait for the migration entry)
would queue up.
But it sure looks like a real bug to me.
Basically, if you ask for an exclusive wakeup, you *have* to take the
resource you are waiting for. You can't just say "never mind, I'll
return -EINTR".
I don't see a simple fix for this yet other than perhaps adding a
wakeup to the "ret = -EINTR; break" case.
Does anybody else see anything? Or maybe see a reason why this
wouldn't be a bug in the first place?
Anyway, I am officially starting to hate that page wait code. I've
stared at it for days now, and I am not getting more fond of it.
Linus
On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
<[email protected]> wrote:
>
> End result: page is unlocked, CPU3 is waiting, nothing will wake CPU3 up.
Not CPU3. CPU3 was the waker. It's thread 2 that is waiting and never
got woken up, of course.
Other than that, the scenario still looks real to me.
Ideas?
Linus
On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
<[email protected]> wrote:
>
> The race goes like this:
>
> thread1 thread2 thread3
> ---- ---- ----
>
> .. CPU1 ...
> __lock_page_killable
> wait_on_page_bit_common()
> get wq lock
> __add_wait_queue_entry_tail_exclusive()
> set_current_state(TASK_KILLABLE);
> release wq lock
> io_schedule
>
> ... CPU2 ...
> __lock_page[_killable]()
> wait_on_page_bit_common()
> get wq lock
> __add_wait_queue_entry_tail_exclusive()
> set_current_state(TASK_KILLABLE);
> release wq lock
> io_schedule
>
> ... CPU3 ...
> unlock_page()
> wake_up_page_bit(page, PG_Locked)
> wakes up CPU1 _only_
>
> ... lethal signal for thread1 happens ...
> if (unlikely(signal_pending_state(state, current))) {
> ret = -EINTR;
> break;
> }
With the race meaning that thread2 never gets woken up due to the
exclusive wakeup being caught by thread1 (which doesn't actually take
the lock).
I think that this bug was introduced by commit 62906027091f ("mm: add
PageWaiters indicating tasks are waiting for a page bit"), which
changed the page lock from using the wait_on_bit_lock() code to its
own _slightly_ different version.
Because it looks like _almost_ the same thing existed in the old
wait_on_bit_lock() code - and that is still used by a couple of
filesystems.
*Most* of the users seem to use TASK_UNINTERRUPTIBLE, which is fine.
But cifs and the sunrpc XPRT_LOCKED code both use the TASK_KILLABLE
form that would seem to have the exact same issue: wait_on_bit_lock()
uses exclusive wait-queues, but then may return with an error without
actually taking the lock.
Now, it turns out that I think the wait_on_bit_lock() code is actually
safe, for a subtle reason.
Why? Unlike the page lock code, the wait_on_bit_lock() code always
tries to get the lock bit before returning an error. So
wait_on_bit_lock() will prefer a successful lock taking over EINTR,
which means that if the bit really was unlocked, it would have been
taken.
And if something else locked the bit again under us and we didn't get
it, that "something else" presumably will then wake things up when it
unlocks.
So the wait_on_bit_lock() code could _also_ lose the wakeup event, but
it would only lose it in situations where somebody else would then
re-send it.
Do people agree with that analysis?
So I think the old wait_on_bit_lock() code ends up being safe -
despite having this same pattern of "exclusive wait but might error
out without taking the lock".
Whether that "wait_on_bit_lock() is safe" was just a fluke or was
because people thought about it is unclear. It's old code. People
probably *did* think about it. I really can't remember.
But it does point to a fix for the page lock case: just move the
signal_pending_state() test to after the bit checking.
So the page locking code is racy because you could have this:
- another cpu does the unlock_page() and wakes up the process (and
uses the exclusive event)
- we then get a lethal signal before we get to the
"signal_pending_state()" state.
- we end up prioritizing the lethal signal, because obviously we
don't care about locking the page any more.
- so now the lock bit may be still clear and there's nobody who is
going to wake up the remaining waiter
Moving the signal_pending_state() down actually fixes the race,
because we know that in order for the exclusive thing to have
mattered, it *has* to actually wake us up. So the unlock_page() must
happen before the lethal signal (where before is well-defined because
of that "try_to_wake_up()" taking a lock and looking at the task
state). The exclusive accounting is only done if the process is
actually woken up, not if it was already running (see
"try_to_wake_up()").
And if the unlock_page() happened before the lethal signal, then we
know that test_and_set_bit_lock() will either work (in which case
we're ok), or another locker successfully came in later - in which
case we're _also_ ok, because that other locker will then do the
unlock again, and wake up subsequent waiters that might have been
blocked by our first exclusive waiter.
So I propose that the fix might be as simple as this:
diff --git a/mm/filemap.c b/mm/filemap.c
index baba290c276b..0b41c8cbeabc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -986,10 +986,6 @@ static inline int
wait_on_page_bit_common(wait_queue_head_t *q,
if (likely(test_bit(bit_nr, &page->flags))) {
io_schedule();
- if (unlikely(signal_pending_state(state, current))) {
- ret = -EINTR;
- break;
- }
}
if (lock) {
@@ -999,6 +995,11 @@ static inline int
wait_on_page_bit_common(wait_queue_head_t *q,
if (!test_bit(bit_nr, &page->flags))
break;
}
+
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
}
finish_wait(q, wait);
but maybe I'm missing something.
Nick, comments?
Linus
On Sun, 27 Aug 2017 16:12:19 -0700
Linus Torvalds <[email protected]> wrote:
> On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
> <[email protected]> wrote:
> >
> > The race goes like this:
> >
> > thread1 thread2 thread3
> > ---- ---- ----
> >
> > .. CPU1 ...
> > __lock_page_killable
> > wait_on_page_bit_common()
> > get wq lock
> > __add_wait_queue_entry_tail_exclusive()
> > set_current_state(TASK_KILLABLE);
> > release wq lock
> > io_schedule
> >
> > ... CPU2 ...
> > __lock_page[_killable]()
> > wait_on_page_bit_common()
> > get wq lock
> > __add_wait_queue_entry_tail_exclusive()
> > set_current_state(TASK_KILLABLE);
> > release wq lock
> > io_schedule
> >
> > ... CPU3 ...
> > unlock_page()
> > wake_up_page_bit(page, PG_Locked)
> > wakes up CPU1 _only_
> >
> > ... lethal signal for thread1 happens ...
> > if (unlikely(signal_pending_state(state, current))) {
> > ret = -EINTR;
> > break;
> > }
>
> With the race meaning that thread2 never gets woken up due to the
> exclusive wakeup being caught by thread1 (which doesn't actually take
> the lock).
>
> I think that this bug was introduced by commit 62906027091f ("mm: add
> PageWaiters indicating tasks are waiting for a page bit"), which
> changed the page lock from using the wait_on_bit_lock() code to its
> own _slightly_ different version.
>
> Because it looks like _almost_ the same thing existed in the old
> wait_on_bit_lock() code - and that is still used by a couple of
> filesystems.
>
> *Most* of the users seem to use TASK_UNINTERRUPTIBLE, which is fine.
> But cifs and the sunrpc XPRT_LOCKED code both use the TASK_KILLABLE
> form that would seem to have the exact same issue: wait_on_bit_lock()
> uses exclusive wait-queues, but then may return with an error without
> actually taking the lock.
>
> Now, it turns out that I think the wait_on_bit_lock() code is actually
> safe, for a subtle reason.
>
> Why? Unlike the page lock code, the wait_on_bit_lock() code always
> tries to get the lock bit before returning an error. So
> wait_on_bit_lock() will prefer a successful lock taking over EINTR,
> which means that if the bit really was unlocked, it would have been
> taken.
>
> And if something else locked the bit again under us and we didn't get
> it, that "something else" presumably will then wake things up when it
> unlocks.
>
> So the wait_on_bit_lock() code could _also_ lose the wakeup event, but
> it would only lose it in situations where somebody else would then
> re-send it.
>
> Do people agree with that analysis?
>
> So I think the old wait_on_bit_lock() code ends up being safe -
> despite having this same pattern of "exclusive wait but might error
> out without taking the lock".
>
> Whether that "wait_on_bit_lock() is safe" was just a fluke or was
> because people thought about it is unclear. It's old code. People
> probably *did* think about it. I really can't remember.
>
> But it does point to a fix for the page lock case: just move the
> signal_pending_state() test to after the bit checking.
>
> So the page locking code is racy because you could have this:
>
> - another cpu does the unlock_page() and wakes up the process (and
> uses the exclusive event)
>
> - we then get a lethal signal before we get to the
> "signal_pending_state()" state.
>
> - we end up prioritizing the lethal signal, because obviously we
> don't care about locking the page any more.
>
> - so now the lock bit may be still clear and there's nobody who is
> going to wake up the remaining waiter
>
> Moving the signal_pending_state() down actually fixes the race,
> because we know that in order for the exclusive thing to have
> mattered, it *has* to actually wake us up. So the unlock_page() must
> happen before the lethal signal (where before is well-defined because
> of that "try_to_wake_up()" taking a lock and looking at the task
> state). The exclusive accounting is only done if the process is
> actually woken up, not if it was already running (see
> "try_to_wake_up()").
>
> And if the unlock_page() happened before the lethal signal, then we
> know that test_and_set_bit_lock() will either work (in which case
> we're ok), or another locker successfully came in later - in which
> case we're _also_ ok, because that other locker will then do the
> unlock again, and wake up subsequent waiters that might have been
> blocked by our first exclusive waiter.
>
> So I propose that the fix might be as simple as this:
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index baba290c276b..0b41c8cbeabc 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -986,10 +986,6 @@ static inline int
> wait_on_page_bit_common(wait_queue_head_t *q,
>
> if (likely(test_bit(bit_nr, &page->flags))) {
> io_schedule();
> - if (unlikely(signal_pending_state(state, current))) {
> - ret = -EINTR;
> - break;
> - }
> }
>
> if (lock) {
> @@ -999,6 +995,11 @@ static inline int
> wait_on_page_bit_common(wait_queue_head_t *q,
> if (!test_bit(bit_nr, &page->flags))
> break;
> }
> +
> + if (unlikely(signal_pending_state(state, current))) {
> + ret = -EINTR;
> + break;
> + }
> }
>
> finish_wait(q, wait);
>
> but maybe I'm missing something.
>
> Nick, comments?
No I don't think you're missing something. We surely could lose our only
wakeup in this window. So an exclusive waiter has to always make sure
they propagate the wakeup (regardless of what they do with the contended
resources itself).
Seems like your fix should solve it. By the look of how wait_on_bit_lock
is structured, the author probably did think about this case a little
better than I did :\
Thanks,
Nick
On Mon, 28 Aug 2017 11:16:48 +1000
Nicholas Piggin <[email protected]> wrote:
> On Sun, 27 Aug 2017 16:12:19 -0700
> Linus Torvalds <[email protected]> wrote:
>
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index baba290c276b..0b41c8cbeabc 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -986,10 +986,6 @@ static inline int
> > wait_on_page_bit_common(wait_queue_head_t *q,
> >
> > if (likely(test_bit(bit_nr, &page->flags))) {
> > io_schedule();
> > - if (unlikely(signal_pending_state(state, current))) {
> > - ret = -EINTR;
> > - break;
> > - }
> > }
> >
> > if (lock) {
> > @@ -999,6 +995,11 @@ static inline int
> > wait_on_page_bit_common(wait_queue_head_t *q,
> > if (!test_bit(bit_nr, &page->flags))
> > break;
> > }
> > +
> > + if (unlikely(signal_pending_state(state, current))) {
> > + ret = -EINTR;
> > + break;
> > + }
> > }
> >
> > finish_wait(q, wait);
> >
> > but maybe I'm missing something.
> >
> > Nick, comments?
>
> No I don't think you're missing something. We surely could lose our only
> wakeup in this window. So an exclusive waiter has to always make sure
> they propagate the wakeup (regardless of what they do with the contended
> resources itself).
>
> Seems like your fix should solve it. By the look of how wait_on_bit_lock
> is structured, the author probably did think about this case a little
> better than I did :\
BTW. since you are looking at this stuff, one other small problem I remember
with exclusive waiters is that losing to a concurrent locker puts them to
the back of the queue. I think that could be fixed with some small change to
the wait loops (first add to tail, then retries add to head). Thoughts?
Thanks,
Nick
On Sun, Aug 27, 2017 at 6:29 PM, Nicholas Piggin <[email protected]> wrote:
>
> BTW. since you are looking at this stuff, one other small problem I remember
> with exclusive waiters is that losing to a concurrent locker puts them to
> the back of the queue. I think that could be fixed with some small change to
> the wait loops (first add to tail, then retries add to head). Thoughts?
No, not that way.
First off, it's oddly complicated, but more importantly, the real
unfairness you lose to is not other things on the wait queue, but to
other lockers that aren't on the wait-queue at all, but instead just
come in and do a "test-and-set" without ever even going through the
slow path.
So instead of playing queuing games, you'd need to just change the
unlock sequence. Right now we basically do:
- clear lock bit and atomically test if contended (and we play games
with bit numbering to do that atomic test efficiently)
- if contended, wake things up
and you'd change the logic to be
- if contended, don't clear the lock bit at all, just transfer the
lock ownership directly to the waiters by walking the wait list
- clear the lock bit only once there are no more wait entries (either
because there were no waiters at all, or because all the entries were
just waiting for the lock to be released)
which is certainly doable with a couple of small extensions to the
page wait key data structure.
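Purely as a conceptual sketch of that unlock sequence (the
page_trylock_handoff() helper below is invented for illustration, and
memory ordering details are ignored; this is not a real or proposed
implementation):
/*
 * Conceptual sketch of the "transfer the lock instead of clearing it"
 * unlock sequence.  page_trylock_handoff() is a hypothetical helper
 * that walks the waiters under the waitqueue lock and, if it finds an
 * exclusive waiter, wakes it with PG_locked ownership already
 * transferred, returning true.
 */
static void unlock_page_with_handoff(struct page *page)
{
	page = compound_head(page);

	if (!PageWaiters(page)) {
		/* Uncontended: just clear the lock bit as today. */
		clear_bit_unlock(PG_locked, &page->flags);
		return;
	}

	/*
	 * Contended: hand PG_locked directly to an exclusive waiter if
	 * one is queued, and never clear the bit here.  Only when all
	 * queued entries are non-exclusive is the bit actually cleared,
	 * followed by waking them all up.
	 */
	if (page_trylock_handoff(page))
		return;

	clear_bit_unlock(PG_locked, &page->flags);
	wake_up_page_bit(page, PG_locked);
}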
But most of my clever schemes the last few days were abject failures,
and honestly, it's late in the rc.
In fact, this late in the game I probably wouldn't even have committed
the small cleanups I did if it wasn't for the fact that thinking of
the whole WQ_FLAG_EXCLUSIVE bit made me find the bug.
So the cleanups were actually what got me to look at the problem in
the first place, and then I went "I'm going to commit the cleanup, and
then I can think about the bug I just found".
I'm just happy that the fix seems to be trivial. I was afraid I'd have
to do something nastier (like have the EINTR case send another
explicit wakeup to make up for the lost one, or some ugly hack like
that).
It was only when I started looking at the history of that code, and I
saw the old bit_lock code, and I went "Hmm. That has the _same_ bug -
oh wait, no it doesn't!" that I realized that there was that simple
fix.
You weren't cc'd on the earlier part of the discussion, you only got
added when I realized what the history and simple fix was.
Linus
On Sun, 27 Aug 2017 22:17:55 -0700
Linus Torvalds <[email protected]> wrote:
> On Sun, Aug 27, 2017 at 6:29 PM, Nicholas Piggin <[email protected]> wrote:
> >
> > BTW. since you are looking at this stuff, one other small problem I remember
> > with exclusive waiters is that losing to a concurrent locker puts them to
> > the back of the queue. I think that could be fixed with some small change to
> > the wait loops (first add to tail, then retries add to head). Thoughts?
>
> No, not that way.
>
> First off, it's oddly complicated, but more importantly, the real
> unfairness you lose to is not other things on the wait queue, but to
> other lockers that aren't on the wait-queue at all, but instead just
> come in and do a "test-and-set" without ever even going through the
> slow path.
Right, there is that unfairness *as well*. The requeue-to-tail logic
seems to make that worse and I thought it seemed like a simple way
to improve it.
>
> So instead of playing queuing games, you'd need to just change the
> unlock sequence. Right now we basically do:
>
> - clear lock bit and atomically test if contended (and we play games
> with bit numbering to do that atomic test efficiently)
>
> - if contended, wake things up
>
> and you'd change the logic to be
>
> - if contended, don't clear the lock bit at all, just transfer the
> lock ownership directly to the waiters by walking the wait list
>
> - clear the lock bit only once there are no more wait entries (either
> because there were no waiters at all, or because all the entries were
> just waiting for the lock to be released)
>
> which is certainly doable with a couple of small extensions to the
> page wait key data structure.
Yeah that would be ideal. Conceptually trivial, I guess care has to
be taken with transferring the memory ordering with the lock. Could
be a good concept to apply elsewhere too.
>
> But most of my clever schemes the last few days were abject failures,
> and honestly, it's late in the rc.
>
> In fact, this late in the game I probably wouldn't even have committed
> the small cleanups I did if it wasn't for the fact that thinking of
> the whole WQ_FLAG_EXCLUSIVE bit made me find the bug.
>
> So the cleanups were actually what got me to look at the problem in
> the first place, and then I went "I'm going to commit the cleanup, and
> then I can think about the bug I just found".
>
> I'm just happy that the fix seems to be trivial. I was afraid I'd have
> to do something nastier (like have the EINTR case send another
> explicit wakeup to make up for the lost one, or some ugly hack like
> that).
>
> It was only when I started looking at the history of that code, and I
> saw the old bit_lock code, and I went "Hmm. That has the _same_ bug -
> oh wait, no it doesn't!" that I realized that there was that simple
> fix.
>
> You weren't cc'd on the earlier part of the discussion, you only got
> added when I realized what the history and simple fix was.
You're right, no such improvement would be appropriate for 4.14.
Thanks,
Nick
> On Fri, Aug 25, 2017 at 7:54 PM, Linus Torvalds <torvalds@linux-
> foundation.org> wrote:
> >
> > Simplify, simplify, simplify.
>
> I've now tried three different approaches (I stopped sending broken patches
> after deciding the first one was unfixable), and they all suck.
>
> I was hoping for something lockless to avoid the whole issue with latency
> over the long list, but anything that has the wait queue entry allocated on the
> stack needs to synchronize the wakeup due to the stack usage, so once you
> have lists that are thousands of entries, either you hold the lock for long
> times (normal wait queues) or take and release them constantly (the swait
> approach), or you batch things up (Tim's wait-queue patches).
>
> Of those three approaches, the batching does seem to be the best of the lot.
> Allocating non-stack wait entries while waiting for pages seems like a bad
> idea. We're probably waiting for IO in normal circumstances, possibly
> because we're low on memory.
>
> So I *am* ok with the batching (your patch #1), but I'm *not* ok with the
> nasty horrible book-keeping to avoid new entries when you batch and
> release the lock and that allows new entries on the list (your patch #2).
>
> That said, I have now stared *way* too much at the page wait code after
> having unsuccessfully tried to replace that wait-queue with other "clever"
> approaches (all of which ended up being crap).
>
> And I'm starting to see a possible solution, or at least improvement.
>
> Let's just assume I take the batching patch. It's not conceptually ugly, it's
> fairly simple, and there are lots of independent arguments for it (ie latency,
> but also possible performance from better parallelism).
>
> That patch doesn't make any data structures bigger or more complicated, it's
> just that single "is this a bookmark entry" thing.
> So that patch just looks fundamentally fine to me, and the only real
> argument I ever had against it was that I would really really _really_ have
> wanted to root-cause the behavior.
>
> So that leaves my fundamental dislike of your other patch.
>
> And it turns out that all my looking at the page wait code wasn't entirely
> unproductive. Yes, I went through three crap approaches before I gave up on
> rewriting it, but in the meantime I did get way too intimate with looking at
> that pile of crud.
>
> And I think I found the reason why you needed that patch #2 in the first place.
>
> Here's what is going on:
>
> - we're going to assume that the problem is all with a single page, not due to
> hash collisions (as per your earlier reports)
>
> - we also know that the only bit that matters is the PG_locked bit
>
> - that means that the only way to get that "multiple concurrent waker"
> situation that your patch #2 tries to handle better is because you have
> multiple people unlocking the page - since that is what causes the wakeups
>
> - that in turn means that you obviously had multiple threads *locking* it too.
>
> - and that means that among those new entries there are lockers coming in
> in the middle and adding an exclusive entry.
>
> - the exclusive entry already stops the wakeup list walking
>
> - but we add non-exclusive entries TO THE BEGINNING of the page waiters
> list
>
> And I really think that last thing is fundamentally wrong.
>
> It's wrong for several reasons:
>
> - because it's unfair: threads that want to lock get put behind threads that
> just want to see the unlocked state.
>
> - because it's stupid: our non-locking waiters will end up waiting again if the
> page got locked, so waking up a locker *and* a lot of non-locking waiters will
> just cause them to go back to sleep anyway
>
> - because it causes us to walk longer lists: we stop walking when we wake up
> the exclusive waiter, but because we always put the non-exclusive waiters in
> there first, we always walk the long list of non-exclusive waiters even if we
> could just stop walking because we woke up an exclusive one.
>
> Now the reason we do this seems to be entirely random: for a *normal* wait
> queue, you really want to always wake up all the non-exclusive waiters,
> because exclusive waiters are only exclusive wrt each other.
>
> But for a page lock, an exclusive waiter really is getting the lock, and the
> other waiters are going to wait for the lock to clear anyway, so the page wait
> thing is actually almost exactly the reverse situation. We *could* put
> exclusive waiters at the beginning of the list instead, and actively try to avoid
> walking the list at all if we have pending lockers.
>
> I'm not doing that, because I think the fair thing to do is to just do things in
> the order they came in. Plus the code is actually simpler if we just always add
> to the tail.
>
> Now, the other thing to look at is how the wakeup function works. It checks
> the aliasing information (page and bit number), but then it
> *also* does:
>
> if (test_bit(key->bit_nr, &key->page->flags))
> return 0;
>
> basically saying "continue walking if somebody else already got the bit".
>
> That's *INSANE*. It's exactly the wrong thing to do. It's basically saying "even
> if we had an exclusive waiter, let's not wake it up, but do let us continue to
> walk the list even though the bit we're waiting for to clear is set already".
>
> What would be sane is to say "stop walking the list now".. So just do that - by
> making "negative return from wake function" mean "stop walking".
>
> So how about just this fairly trivial patch?
I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
But they don't fix the issue. I can still get a similar call stack.
Thanks,
Kan
>
> In fact, I think this may help even *without* Tim's patch #1. So I think this
> would be lovely to test on that problem load on its own, and seeing if it
> makes the wait queues behave better.
>
> It might not cut down on the total length of the wait-queue, but it should
> hopefully cause it to walk it much less. We now hit the exclusive waiters
> earlier and stop waking things up when we have a new locker thread pending.
> And when the page ends up being locked again, we stop walking the list
> entirely, so no unnecessary traversal.
>
> This patch is small and at least minimally works (I'm running it right now).
>
> Linus
On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <[email protected]> wrote:
>
> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
> But they don't fix the issue. I can still get the similar call stack.
So the main issue was that I *really* hated Tim's patch #2, and the
patch to clean up the page wait queue should now make his patch series
much more palatable.
Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
always put the new entries at the end of the waitqueue.
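For the record, the add path in the cleaned-up wait_on_page_bit_common()
then ends up roughly like the fragment below: lockers and bit-waiters alike
go to the tail, so the walk sees waiters strictly in arrival order (sketch
only, not the attached patches themselves):

	spin_lock_irq(&q->lock);
	if (likely(list_empty(&wait->entry))) {
		/* Everybody, exclusive or not, is added at the tail. */
		__add_wait_queue_entry_tail(q, wait);
		SetPageWaiters(page);
	}
	spin_unlock_irq(&q->lock);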
The attached patches just apply directly on top of plain 4.13-rc7.
That makes patch #2 much more palatable, since it now doesn't need to
play games and worry about new arrivals.
But note the lack of testing. I've actually booted this and am running
these two patches right now, but honestly, you should consider them
"untested" simply because I can't trigger the page waiters contention
case to begin with.
But it's really just Tim's patches, modified for the page waitqueue
cleanup, which makes patch #2 much simpler, and now it's
palatable: it's just using the same bookmark thing that the normal
wakeup uses, no extra hacks.
So Tim should look these over, and they should definitely be tested on
that load-from-hell that you guys have, but if this set works, at
least I'm ok with it now.
Tim - did I miss anything? I added a "cpu_relax()" in there between
releasing the lock and irqs and re-taking them. I'm not convinced it makes
any difference, but I wanted to mark that "take a breather" thing.
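For reference, the bookmark loop in the forward-ported wake_up_page_bit()
then looks roughly like the fragment below (key setup and the PageWaiters
bookkeeping omitted; a sketch, not a byte-for-byte copy of the attached
patches):

	wait_queue_entry_t bookmark;
	unsigned long flags;

	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock: waiters that already
		 * finished waking up can grab the lock and remove themselves
		 * from the wait queue in the meantime.
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	spin_unlock_irqrestore(&q->lock, flags);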
Oh, there's one more case I only realized after the patches: the
stupid add_page_wait_queue() code still adds to the head of the list.
So technically you need this too:
diff --git a/mm/filemap.c b/mm/filemap.c
index 74123a298f53..598c3be57509 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
 	unsigned long flags;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, waiter);
+	__add_wait_queue_entry_tail(q, waiter);
 	SetPageWaiters(page);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
but that only matters if you actually use the cachefiles thing, which
I hope/assume you don't.
Linus
On 08/28/2017 09:48 AM, Linus Torvalds wrote:
> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <[email protected]> wrote:
>>
>> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
>> But they don't fix the issue. I can still get the similar call stack.
>
> So the main issue was that I *really* hated Tim's patch #2, and the
> patch to clean up the page wait queue should now make his patch series
> much more palatable.
>
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
>
> The attached patches just apply directly on top of plain 4.13-rc7.
>
> That makes patch #2 much more palatable, since it now doesn't need to
> play games and worry about new arrivals.
>
> But note the lack of testing. I've actually booted this and am running
> these two patches right now, but honestly, you should consider them
> "untested" simply because I can't trigger the page waiters contention
> case to begin with.
>
> But it's really just Tim's patches, modified for the page waitqueue
> cleanup which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal
> wakeup uses, no extra hacks.
>
> So Tim should look these over, and they should definitely be tested on
> that load-from-hell that you guys have, but if this set works, at
> least I'm ok with it now.
>
> Tim - did I miss anything? I added a "cpu_relax()" in there between
> the release lock and irq and re-take it, I'm not convinced it makes
> any difference, but I wanted to mark that "take a breather" thing.
>
> Oh, there's one more case I only realized after the patches: the
> stupid add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 74123a298f53..598c3be57509 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
> 	unsigned long flags;
>
> spin_lock_irqsave(&q->lock, flags);
> - __add_wait_queue(q, waiter);
> + __add_wait_queue_entry_tail(q, waiter);
I also found it odd that this part of the code adds to the head, but
I wasn't sure enough about the history behind it to change it.
Adding to the tail makes things much cleaner. I'm glad that
those ugly hacks that added flags and a counter
to track new arrivals can be discarded.
The modified patchset looks fine to me. So pending Kan's test
of the new code, I think we are good.
Thanks.
Tim
> SetPageWaiters(page);
> spin_unlock_irqrestore(&q->lock, flags);
> }
>
> but that only matters if you actually use the cachefiles thing, which
> I hope/assume you don't.
>
> Linus
>
On Tue, Aug 29, 2017 at 5:57 AM, Liang, Kan <[email protected]> wrote:
>>
>> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
>> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
>> always put the new entries at the end of the waitqueue.
>
> The patches fix the long wait issue.
>
> Tested-by: Kan Liang <[email protected]>
Ok. I'm not 100% comfortable applying them at rc7, so let me think
about it. There's only one known load triggering this, and by "known"
I mean "not really known" since we don't even know what the heck it
does outside of Intel and whoever your customer is.
So I suspect I'll apply the patches next merge window, and we can
maybe mark them for stable if this actually ends up mattering.
Can you tell if the problem is actually hitting _production_ use, or
is it some kind of benchmark stress-test?
Linus
On 08/28/2017 09:48 AM, Linus Torvalds wrote:
> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <[email protected]> wrote:
>>
>> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
>> But they don't fix the issue. I can still get the similar call stack.
>
> So the main issue was that I *really* hated Tim's patch #2, and the
> patch to clean up the page wait queue should now make his patch series
> much more palatable.
>
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
>
> The attached patches just apply directly on top of plain 4.13-rc7.
>
> That makes patch #2 much more palatable, since it now doesn't need to
> play games and worry about new arrivals.
>
> But note the lack of testing. I've actually booted this and am running
> these two patches right now, but honestly, you should consider them
> "untested" simply because I can't trigger the page waiters contention
> case to begin with.
>
> But it's really just Tim's patches, modified for the page waitqueue
> cleanup which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal
> wakeup uses, no extra hacks.
>
> So Tim should look these over, and they should definitely be tested on
> that load-from-hell that you guys have, but if this set works, at
> least I'm ok with it now.
>
> Tim - did I miss anything? I added a "cpu_relax()" in there between
> the release lock and irq and re-take it, I'm not convinced it makes
> any difference, but I wanted to mark that "take a breather" thing.
>
> Oh, there's one more case I only realized after the patches: the
> stupid add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:
BTW, are you going to add the chunk below separately as part of your
wait queue cleanup patch?
Tim
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 74123a298f53..598c3be57509 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
> 	unsigned long flags;
>
> spin_lock_irqsave(&q->lock, flags);
> - __add_wait_queue(q, waiter);
> + __add_wait_queue_entry_tail(q, waiter);
> SetPageWaiters(page);
> spin_unlock_irqrestore(&q->lock, flags);
> }
>
> but that only matters if you actually use the cachefiles thing, which
> I hope/assume you don't.
>
> Linus
>
On 08/29/2017 09:01 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 5:57 AM, Liang, Kan <[email protected]> wrote:
>>>
>>> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
>>> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
>>> always put the new entries at the end of the waitqueue.
>>
>> The patches fix the long wait issue.
>>
>> Tested-by: Kan Liang <[email protected]>
>
> Ok. I'm not 100% comfortable applying them at rc7, so let me think
> about it. There's only one known load triggering this, and by "known"
> I mean "not really known" since we don't even know what the heck it
> does outside of intel and whoever your customer is.
>
> So I suspect I'll apply the patches next merge window, and we can
> maybe mark them for stable if this actually ends up mattering.
>
> Can you tell if the problem is actually hitting _production_ use or
> was some kind of benchmark stress-test?
>
>
It is not affecting production use, but rather the customer's acceptance
test for their systems. So I suspect it is a stress test.
Tim
On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <[email protected]> wrote:
>
> It is affecting not a production use, but the customer's acceptance
> test for their systems. So I suspect it is a stress test.
Can you gently poke them and ask if they might make their stress test
code available?
Tell them that we have a fix, but right now it's delayed into 4.14
because we have no visibility into what it is that it actually fixes,
and whether it's all that critical or just some microbenchmark.
Thanks,
Linus
> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <[email protected]> wrote:
> >
> > I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
> > But they don't fix the issue. I can still get the similar call stack.
>
> So the main issue was that I *really* hated Tim's patch #2, and the patch to
> clean up the page wait queue should now make his patch series much more
> palatable.
>
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
>
The patches fix the long wait issue.
Tested-by: Kan Liang <[email protected]>
> The attached patches just apply directly on top of plain 4.13-rc7.
>
> That makes patch #2 much more palatable, since it now doesn't need to play
> games and worry about new arrivals.
>
> But note the lack of testing. I've actually booted this and am running these
> two patches right now, but honestly, you should consider them "untested"
> simply because I can't trigger the page waiters contention case to begin with.
>
> But it's really just Tim's patches, modified for the page waitqueue cleanup
> which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal wakeup
> uses, no extra hacks.
>
> So Tim should look these over, and they should definitely be tested on that
> load-from-hell that you guys have, but if this set works, at least I'm ok with it
> now.
>
> Tim - did I miss anything? I added a "cpu_relax()" in there between the
> release lock and irq and re-take it, I'm not convinced it makes any difference,
> but I wanted to mark that "take a breather" thing.
>
> Oh, there's one more case I only realized after the patches: the stupid
> add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 74123a298f53..598c3be57509 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
> 	unsigned long flags;
>
> spin_lock_irqsave(&q->lock, flags);
> - __add_wait_queue(q, waiter);
> + __add_wait_queue_entry_tail(q, waiter);
> SetPageWaiters(page);
> spin_unlock_irqrestore(&q->lock, flags);
> }
>
> but that only matters if you actually use the cachefiles thing, which I
> hope/assume you don't.
>
> Linus
On Tue, Aug 29, 2017 at 9:17 AM, Tim Chen <[email protected]> wrote:
>
> BTW, are you going to add the chunk below separately as part of your
> wait queue cleanup patch?
I did.
Commit 9c3a815f471a ("page waitqueue: always add new entries at the end")
Linus
On 08/29/2017 09:24 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <[email protected]> wrote:
>>
>> It is affecting not a production use, but the customer's acceptance
>> test for their systems. So I suspect it is a stress test.
>
> Can you gently poke them and ask if they might make their stress test
> code available?
>
> Tell them that we have a fix, but right now it's delayed into 4.14
> because we have no visibility into what it is that it actually fixes,
> and whether it's all that critical or just some microbenchmark.
>
Thanks. We'll do that.
Tim
On 08/29/2017 09:24 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <[email protected]> wrote:
>>
>> It is affecting not a production use, but the customer's acceptance
>> test for their systems. So I suspect it is a stress test.
>
> Can you gently poke them and ask if they might make their stress test
> code available?
>
> Tell them that we have a fix, but right now it's delayed into 4.14
> because we have no visibility into what it is that it actually fixes,
> and whether it's all that critical or just some microbenchmark.
>
>
Linus,
Here's what the customer thinks happened and is willing to tell us.
They have a parent process that spawns off 10 children per core and
kicks them off to run. The child processes all access a common library.
We have 384 cores, so 3840 child processes are running. When migration occurs
on a page in the common library, the first child that accesses the page will
page fault and lock the page, with the other children also page faulting
quickly and piling up in the page wait list until the first child is done.
Probably some access pattern of the common library induces the
page migration to happen.
BTW, will you be merging these 2 patches in 4.14?
Thanks.
Tim
On Wed, Sep 13, 2017 at 7:12 PM, Tim Chen <[email protected]> wrote:
>
> BTW, will you be merging these 2 patches in 4.14?
Yes, and thanks for reminding me.
In fact, would you mind sending me the latest versions, rather than me
digging them out of the disaster area that is my mailbox and possibly
picking an older version?
Linus
On Wed, 13 Sep 2017, Tim Chen wrote:
> Here's what the customer think happened and is willing to tell us.
> They have a parent process that spawns off 10 children per core and
> kicked them to run. The child processes all access a common library.
> We have 384 cores so 3840 child processes running. When migration occur on
> a page in the common library, the first child that access the page will
> page fault and lock the page, with the other children also page faulting
> quickly and pile up in the page wait list, till the first child is done.
I think we need some way to avoid migration in cases like this. This is
crazy. Page migration was not written to deal with something like this.
On 09/13/2017 07:27 PM, Linus Torvalds wrote:
> On Wed, Sep 13, 2017 at 7:12 PM, Tim Chen <[email protected]> wrote:
>>
>> BTW, will you be merging these 2 patches in 4.14?
>
> Yes, and thanks for reminding me.
>
> In fact, would you mind sending me the latest versions, rather than me
> digging them out of the disaster area that is my mailbox and possibly
> picking an older version?
>
> Linus
>
Attached are the two patches that you updated to sync with your other
page wait queue cleanup and sent to Kan and me:
https://marc.info/?l=linux-kernel&m=150393893927105&w=2
Kan tested these before, so they should still be good.
I checked that they apply cleanly on the latest master.
Thanks.
Tim
On Thu, Sep 14, 2017 at 9:50 AM, Tim Chen <[email protected]> wrote:
>
> Kan tested this before so it should be still good.
> I checked that it applied cleanly on latest master.
Thanks, applied.
I really hope we end up fixing the migration thing too, but at least
4.14 will have the mitigation for the long wait queues.
Linus