2016-12-25 03:00:44

by Nicholas Piggin

Subject: [PATCH 0/2] PageWaiters again

I cleaned up the changelog a bit and made a few tweaks to patch 1 as
described in my reply to Hugh. Resending with SOBs.

Thanks,
Nick

Nicholas Piggin (2):
mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked
mm: add PageWaiters indicating tasks are waiting for a page bit

include/linux/mm.h | 2 +
include/linux/page-flags.h | 33 ++++++--
include/linux/pagemap.h | 23 +++---
include/linux/writeback.h | 1 -
include/trace/events/mmflags.h | 2 +-
init/main.c | 3 +-
mm/filemap.c | 181 +++++++++++++++++++++++++++++++++--------
mm/internal.h | 2 +
mm/memory-failure.c | 4 +-
mm/migrate.c | 14 ++--
mm/swap.c | 2 +
11 files changed, 199 insertions(+), 68 deletions(-)

--
2.11.0


2016-12-25 03:00:53

by Nicholas Piggin

Subject: [PATCH 1/2] mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked

A page is not added to the swap cache without being swap backed,
so PageSwapBacked mappings can use PG_owner_priv_1 for PageSwapCache.

Acked-by: Hugh Dickins <[email protected]>
Signed-off-by: Nicholas Piggin <[email protected]>
---
include/linux/page-flags.h | 24 ++++++++++++++++--------
include/trace/events/mmflags.h | 1 -
mm/memory-failure.c | 4 +---
mm/migrate.c | 14 ++++++++------
4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda91238..a57c909a15e4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -87,7 +87,6 @@ enum pageflags {
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
PG_head, /* A head page */
- PG_swapcache, /* Swap page: swp_entry_t in private */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
@@ -110,6 +109,9 @@ enum pageflags {
/* Filesystems */
PG_checked = PG_owner_priv_1,

+ /* SwapBacked */
+ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
+
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
@@ -314,7 +316,13 @@ PAGEFLAG_FALSE(HighMem)
#endif

#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
+static __always_inline int PageSwapCache(struct page *page)
+{
+ return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
+
+}
+SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
+CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
@@ -701,12 +709,12 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. It they are, there is a problem.
*/
-#define PAGE_FLAGS_CHECK_AT_FREE \
- (1UL << PG_lru | 1UL << PG_locked | \
- 1UL << PG_private | 1UL << PG_private_2 | \
- 1UL << PG_writeback | 1UL << PG_reserved | \
- 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+#define PAGE_FLAGS_CHECK_AT_FREE \
+ (1UL << PG_lru | 1UL << PG_locked | \
+ 1UL << PG_private | 1UL << PG_private_2 | \
+ 1UL << PG_writeback | 1UL << PG_reserved | \
+ 1UL << PG_slab | 1UL << PG_active | \
+ 1UL << PG_unevictable | __PG_MLOCKED)

/*
* Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab48a2fb..30c2adbdebe8 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -95,7 +95,6 @@
{1UL << PG_private_2, "private_2" }, \
{1UL << PG_writeback, "writeback" }, \
{1UL << PG_head, "head" }, \
- {1UL << PG_swapcache, "swapcache" }, \
{1UL << PG_mappedtodisk, "mappedtodisk" }, \
{1UL << PG_reclaim, "reclaim" }, \
{1UL << PG_swapbacked, "swapbacked" }, \
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 19e796d36a62..f283c7e0a2a3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -764,12 +764,11 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/

#define dirty (1UL << PG_dirty)
-#define sc (1UL << PG_swapcache)
+#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
-#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
@@ -819,7 +818,6 @@ static struct page_state {
#undef mlock
#undef writeback
#undef lru
-#undef swapbacked
#undef head
#undef slab
#undef reserved
diff --git a/mm/migrate.c b/mm/migrate.c
index 0ed24b1fa77b..87f4d0f81819 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -466,13 +466,15 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- __SetPageSwapBacked(newpage);
-
get_page(newpage); /* add cache reference */
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ if (PageSwapBacked(page)) {
+ __SetPageSwapBacked(newpage);
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+ } else {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
}

/* Move dirty while page refs frozen and newpage not yet exposed */
--
2.11.0

2016-12-25 03:00:58

by Nicholas Piggin

Subject: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

Add a new page flag, PageWaiters, to indicate the page waitqueue has
tasks waiting. This can be tested rather than testing waitqueue_active
which requires another cacheline load.

This bit is always set when the page has tasks on page_waitqueue(page),
and is set and cleared under the waitqueue lock. It may be set when
there are no tasks on the waitqueue, which will cause a harmless extra
wakeup check that will clear the bit.

The generic bit-waitqueue infrastructure is no longer used for pages.
Instead, waitqueues are used directly with a custom key type. The
generic code was not flexible enough to have PageWaiters manipulation
under the waitqueue lock (which simplifies concurrency).

This improves the performance of page lock intensive microbenchmarks by
2-3%.

Putting two bits in the same word opens the opportunity to remove the
memory barrier between clearing the lock bit and testing the waiters
bit, after some work on the arch primitives (e.g., ensuring memory
operand widths match and cover both bits).

Signed-off-by: Nicholas Piggin <[email protected]>
---
include/linux/mm.h | 2 +
include/linux/page-flags.h | 9 ++
include/linux/pagemap.h | 23 +++---
include/linux/writeback.h | 1 -
include/trace/events/mmflags.h | 1 +
init/main.c | 3 +-
mm/filemap.c | 181 +++++++++++++++++++++++++++++++++--------
mm/internal.h | 2 +
mm/swap.c | 2 +
9 files changed, 174 insertions(+), 50 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4424784ac374..fe6b4036664a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1758,6 +1758,8 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}

+extern void __init pagecache_init(void);
+
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a57c909a15e4..c56b39890a41 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -73,6 +73,7 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
+ PG_waiters, /* Page has waiters, check its waitqueue */
PG_error,
PG_referenced,
PG_uptodate,
@@ -169,6 +170,9 @@ static __always_inline int PageCompound(struct page *page)
* for compound page all operations related to the page flag applied to
* head page.
*
+ * PF_ONLY_HEAD:
+ * for compound page, callers only ever operate on the head page.
+ *
* PF_NO_TAIL:
* modifications of the page flag must be done on small or head pages,
* checks can be done on tail pages too.
@@ -178,6 +182,9 @@ static __always_inline int PageCompound(struct page *page)
*/
#define PF_ANY(page, enforce) page
#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_ONLY_HEAD(page, enforce) ({ \
+ VM_BUG_ON_PGFLAGS(PageTail(page), page); \
+ page;})
#define PF_NO_TAIL(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
compound_head(page);})
@@ -255,6 +262,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)

__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
@@ -743,6 +751,7 @@ static inline int page_has_private(struct page *page)

#undef PF_ANY
#undef PF_HEAD
+#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7dbe9148b2f8..d7f25f754d60 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -486,22 +486,14 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
* and for filesystems which need to wait on PG_private.
*/
extern void wait_on_page_bit(struct page *page, int bit_nr);
-
extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable_timeout(struct page *page,
- int bit_nr, unsigned long timeout);
-
-static inline int wait_on_page_locked_killable(struct page *page)
-{
- if (!PageLocked(page))
- return 0;
- return wait_on_page_bit_killable(compound_head(page), PG_locked);
-}
+extern void wake_up_page_bit(struct page *page, int bit_nr);

-extern wait_queue_head_t *page_waitqueue(struct page *page);
static inline void wake_up_page(struct page *page, int bit)
{
- __wake_up_bit(page_waitqueue(page), &page->flags, bit);
+ if (!PageWaiters(page))
+ return;
+ wake_up_page_bit(page, bit);
}

/*
@@ -517,6 +509,13 @@ static inline void wait_on_page_locked(struct page *page)
wait_on_page_bit(compound_head(page), PG_locked);
}

+static inline int wait_on_page_locked_killable(struct page *page)
+{
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
+}
+
/*
* Wait for a page to complete writeback
*/
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c78f9f0920b5..5527d910ba3d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -375,7 +375,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);

void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
-void page_writeback_init(void);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
bool wb_over_bg_thresh(struct bdi_writeback *wb);

diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 30c2adbdebe8..9e687ca9a307 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -81,6 +81,7 @@

#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
+ {1UL << PG_waiters, "waiters" }, \
{1UL << PG_error, "error" }, \
{1UL << PG_referenced, "referenced" }, \
{1UL << PG_uptodate, "uptodate" }, \
diff --git a/init/main.c b/init/main.c
index c81c9fa21bc7..b0c9d6facef9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -647,9 +647,8 @@ asmlinkage __visible void __init start_kernel(void)
security_init();
dbg_late_init();
vfs_caches_init();
+ pagecache_init();
signals_init();
- /* rootfs populating might need page-writeback */
- page_writeback_init();
proc_root_init();
nsfs_init();
cpuset_init();
diff --git a/mm/filemap.c b/mm/filemap.c
index 32be3c8f3a11..82f26cde830c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -739,45 +739,159 @@ EXPORT_SYMBOL(__page_cache_alloc);
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-wait_queue_head_t *page_waitqueue(struct page *page)
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
{
- return bit_waitqueue(page, 0);
+ return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}
-EXPORT_SYMBOL(page_waitqueue);

-void wait_on_page_bit(struct page *page, int bit_nr)
+void __init pagecache_init(void)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ int i;

- if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(&page_wait_table[i]);
+
+ page_writeback_init();
}
-EXPORT_SYMBOL(wait_on_page_bit);

-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+struct wait_page_key {
+ struct page *page;
+ int bit_nr;
+ int page_match;
+};
+
+struct wait_page_queue {
+ struct page *page;
+ int bit_nr;
+ wait_queue_t wait;
+};
+
+static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ struct wait_page_key *key = arg;
+ struct wait_page_queue *wait_page
+ = container_of(wait, struct wait_page_queue, wait);
+
+ if (wait_page->page != key->page)
+ return 0;
+ key->page_match = 1;

- if (!test_bit(bit_nr, &page->flags))
+ if (wait_page->bit_nr != key->bit_nr)
+ return 0;
+ if (test_bit(key->bit_nr, &key->page->flags))
return 0;

- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io, TASK_KILLABLE);
+ return autoremove_wake_function(wait, mode, sync, key);
}

-int wait_on_page_bit_killable_timeout(struct page *page,
- int bit_nr, unsigned long timeout)
+void wake_up_page_bit(struct page *page, int bit_nr)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ wait_queue_head_t *q = page_waitqueue(page);
+ struct wait_page_key key;
+ unsigned long flags;

- wait.key.timeout = jiffies + timeout;
- if (!test_bit(bit_nr, &page->flags))
- return 0;
- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io_timeout, TASK_KILLABLE);
+ key.page = page;
+ key.bit_nr = bit_nr;
+ key.page_match = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key(q, TASK_NORMAL, &key);
+ /*
+ * It is possible for other pages to have collided on the waitqueue
+ * hash, so in that case check for a page match. That prevents a long-
+ * term waiter
+ *
+ * It is still possible to miss a case here, when we woke page waiters
+ * and removed them from the waitqueue, but there are still other
+ * page waiters.
+ */
+ if (!waitqueue_active(q) || !key.page_match) {
+ ClearPageWaiters(page);
+ /*
+ * It's possible to miss clearing Waiters here, when we woke
+ * our page waiters, but the hashed waitqueue has waiters for
+ * other pages on it.
+ *
+ * That's okay, it's a rare case. The next waker will clear it.
+ */
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(wake_up_page_bit);
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ struct page *page, int bit_nr, int state, bool lock)
+{
+ struct wait_page_queue wait_page;
+ wait_queue_t *wait = &wait_page.wait;
+ int ret = 0;
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.page = page;
+ wait_page.bit_nr = bit_nr;
+
+ for (;;) {
+ spin_lock_irq(&q->lock);
+
+ if (likely(list_empty(&wait->task_list))) {
+ if (lock)
+ __add_wait_queue_tail_exclusive(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ SetPageWaiters(page);
+ }
+
+ set_current_state(state);
+
+ spin_unlock_irq(&q->lock);
+
+ if (likely(test_bit(bit_nr, &page->flags))) {
+ io_schedule();
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ if (lock) {
+ if (!test_and_set_bit_lock(bit_nr, &page->flags))
+ break;
+ } else {
+ if (!test_bit(bit_nr, &page->flags))
+ break;
+ }
+ }
+
+ finish_wait(q, wait);
+
+ /*
+ * A signal could leave PageWaiters set. Clearing it here if
+ * !waitqueue_active would be possible (by open-coding finish_wait),
+ * but still fail to catch it in the case of wait hash collision. We
+ * already can fail to clear wait hash collision cases, so don't
+ * bother with signals either.
+ */
+
+ return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
-EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);

/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -793,6 +907,7 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)

spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, waiter);
+ SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
@@ -874,23 +989,19 @@ EXPORT_SYMBOL_GPL(page_endio);
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
*/
-void __lock_page(struct page *page)
+void __lock_page(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
}
EXPORT_SYMBOL(__lock_page);

-int __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
- bit_wait_io, TASK_KILLABLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);

diff --git a/mm/internal.h b/mm/internal.h
index 44d68895a9b9..7aa2ea0a8623 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,6 +36,8 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

+void page_writeback_init(void);
+
int do_swap_page(struct vm_fault *vmf);

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
diff --git a/mm/swap.c b/mm/swap.c
index 4dcf852e1e6d..844baedd2429 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
+ __ClearPageWaiters(page);
mem_cgroup_uncharge(page);
}

@@ -784,6 +785,7 @@ void release_pages(struct page **pages, int nr, bool cold)

/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
+ __ClearPageWaiters(page);

list_add(&page->lru, &pages_to_free);
}
--
2.11.0

2016-12-25 05:14:00

by Hugh Dickins

Subject: Re: [PATCH 1/2] mm: Use owner_priv bit for PageSwapCache, valid when PageSwapBacked

On Sun, 25 Dec 2016, Nicholas Piggin wrote:

> A page is not added to the swap cache without being swap backed,
> so PageSwapBacked mappings can use PG_owner_priv_1 for PageSwapCache.
>
> Acked-by: Hugh Dickins <[email protected]>

Yes, confirmed, Acked-by: Hugh Dickins <[email protected]>
I checked through your migrate and memory-failure additions,
and both look correct to me. I still think that more should
be done for KPF_SWAPCACHE, to exclude the new false positives;
but as I said before, no urgency, that can be a later followup.

> Signed-off-by: Nicholas Piggin <[email protected]>
> ---
> include/linux/page-flags.h | 24 ++++++++++++++++--------
> include/trace/events/mmflags.h | 1 -
> mm/memory-failure.c | 4 +---
> mm/migrate.c | 14 ++++++++------
> 4 files changed, 25 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 74e4dda91238..a57c909a15e4 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -87,7 +87,6 @@ enum pageflags {
> PG_private_2, /* If pagecache, has fs aux data */
> PG_writeback, /* Page is under writeback */
> PG_head, /* A head page */
> - PG_swapcache, /* Swap page: swp_entry_t in private */
> PG_mappedtodisk, /* Has blocks allocated on-disk */
> PG_reclaim, /* To be reclaimed asap */
> PG_swapbacked, /* Page is backed by RAM/swap */
> @@ -110,6 +109,9 @@ enum pageflags {
> /* Filesystems */
> PG_checked = PG_owner_priv_1,
>
> + /* SwapBacked */
> + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
> +
> /* Two page bits are conscripted by FS-Cache to maintain local caching
> * state. These bits are set on pages belonging to the netfs's inodes
> * when those inodes are being locally cached.
> @@ -314,7 +316,13 @@ PAGEFLAG_FALSE(HighMem)
> #endif
>
> #ifdef CONFIG_SWAP
> -PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
> +static __always_inline int PageSwapCache(struct page *page)
> +{
> + return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
> +
> +}
> +SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
> +CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
> #else
> PAGEFLAG_FALSE(SwapCache)
> #endif
> @@ -701,12 +709,12 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
> * Flags checked when a page is freed. Pages being freed should not have
> * these flags set. It they are, there is a problem.
> */
> -#define PAGE_FLAGS_CHECK_AT_FREE \
> - (1UL << PG_lru | 1UL << PG_locked | \
> - 1UL << PG_private | 1UL << PG_private_2 | \
> - 1UL << PG_writeback | 1UL << PG_reserved | \
> - 1UL << PG_slab | 1UL << PG_swapcache | 1UL << PG_active | \
> - 1UL << PG_unevictable | __PG_MLOCKED)
> +#define PAGE_FLAGS_CHECK_AT_FREE \
> + (1UL << PG_lru | 1UL << PG_locked | \
> + 1UL << PG_private | 1UL << PG_private_2 | \
> + 1UL << PG_writeback | 1UL << PG_reserved | \
> + 1UL << PG_slab | 1UL << PG_active | \
> + 1UL << PG_unevictable | __PG_MLOCKED)
>
> /*
> * Flags checked when a page is prepped for return by the page allocator.
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index 5a81ab48a2fb..30c2adbdebe8 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -95,7 +95,6 @@
> {1UL << PG_private_2, "private_2" }, \
> {1UL << PG_writeback, "writeback" }, \
> {1UL << PG_head, "head" }, \
> - {1UL << PG_swapcache, "swapcache" }, \
> {1UL << PG_mappedtodisk, "mappedtodisk" }, \
> {1UL << PG_reclaim, "reclaim" }, \
> {1UL << PG_swapbacked, "swapbacked" }, \
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 19e796d36a62..f283c7e0a2a3 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -764,12 +764,11 @@ static int me_huge_page(struct page *p, unsigned long pfn)
> */
>
> #define dirty (1UL << PG_dirty)
> -#define sc (1UL << PG_swapcache)
> +#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
> #define unevict (1UL << PG_unevictable)
> #define mlock (1UL << PG_mlocked)
> #define writeback (1UL << PG_writeback)
> #define lru (1UL << PG_lru)
> -#define swapbacked (1UL << PG_swapbacked)
> #define head (1UL << PG_head)
> #define slab (1UL << PG_slab)
> #define reserved (1UL << PG_reserved)
> @@ -819,7 +818,6 @@ static struct page_state {
> #undef mlock
> #undef writeback
> #undef lru
> -#undef swapbacked
> #undef head
> #undef slab
> #undef reserved
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 0ed24b1fa77b..87f4d0f81819 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -466,13 +466,15 @@ int migrate_page_move_mapping(struct address_space *mapping,
> */
> newpage->index = page->index;
> newpage->mapping = page->mapping;
> - if (PageSwapBacked(page))
> - __SetPageSwapBacked(newpage);
> -
> get_page(newpage); /* add cache reference */
> - if (PageSwapCache(page)) {
> - SetPageSwapCache(newpage);
> - set_page_private(newpage, page_private(page));
> + if (PageSwapBacked(page)) {
> + __SetPageSwapBacked(newpage);
> + if (PageSwapCache(page)) {
> + SetPageSwapCache(newpage);
> + set_page_private(newpage, page_private(page));
> + }
> + } else {
> + VM_BUG_ON_PAGE(PageSwapCache(page), page);
> }
>
> /* Move dirty while page refs frozen and newpage not yet exposed */
> --
> 2.11.0

2016-12-25 21:51:20

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Sat, Dec 24, 2016 at 7:00 PM, Nicholas Piggin <[email protected]> wrote:
> Add a new page flag, PageWaiters, to indicate the page waitqueue has
> tasks waiting. This can be tested rather than testing waitqueue_active
> which requires another cacheline load.

Ok, I applied this one too. I think there's room for improvement, but
I don't think it's going to help to just wait another release cycle
and hope something happens.

Example room for improvement from a profile of unlock_page():

46.44 │ lock andb $0xfe,(%rdi)
34.22 │ mov (%rdi),%rax

this has the old "do atomic op on a byte, then load the whole word"
issue that we used to have with the nasty zone lookup code too. And it
causes a horrible pipeline hiccup because the load will not forward
the data from the (partial) store.

It's really a misfeature of our asm optimizations of the atomic bit
ops. Using "andb" is slightly smaller, but in this case in particular,
an "andq" would be a ton faster, and the mask still fits in an imm8,
so it's not even hugely larger.

But it might also be a good idea to simply use a "cmpxchg" loop here.
That also gives atomicity guarantees that we don't have with the
"clear bit and then load the value".

Regardless, I think this is worth more people looking at and testing.
And merging it is probably the best way for that to happen.

Linus
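
For context, here is a minimal sketch of the cmpxchg-style unlock suggested above, written against the helpers this series introduces (PG_waiters, wake_up_page_bit). It is an illustration of the idea only, with a made-up function name, not the code that was eventually merged (the thread below ends up using a sign-flag trick instead):

static void unlock_page_cmpxchg_sketch(struct page *page)
{
	unsigned long old, new, seen;

	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	old = READ_ONCE(page->flags);
	for (;;) {
		new = old & ~(1UL << PG_locked);
		/* full-barrier cmpxchg, so release ordering is covered */
		seen = cmpxchg(&page->flags, old, new);
		if (seen == old)
			break;
		old = seen;
	}
	/*
	 * The waiters check uses the value returned by the atomic op itself,
	 * so there is no separate load of the just-stored word to stall on.
	 */
	if (old & (1UL << PG_waiters))
		wake_up_page_bit(page, PG_locked);
}

Because PG_locked and PG_waiters live in the same word, the single cmpxchg both releases the lock and reports whether anyone was waiting.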

2016-12-26 01:17:15

by Nicholas Piggin

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Sun, 25 Dec 2016 13:51:17 -0800
Linus Torvalds <[email protected]> wrote:

> On Sat, Dec 24, 2016 at 7:00 PM, Nicholas Piggin <[email protected]> wrote:
> > Add a new page flag, PageWaiters, to indicate the page waitqueue has
> > tasks waiting. This can be tested rather than testing waitqueue_active
> > which requires another cacheline load.
>
> Ok, I applied this one too. I think there's room for improvement, but
> I don't think it's going to help to just wait another release cycle
> and hope something happens.
>
> Example room for improvement from a profile of unlock_page():
>
> 46.44 │ lock andb $0xfe,(%rdi)
> 34.22 │ mov (%rdi),%rax
>
> this has the old "do atomic op on a byte, then load the whole word"
> issue that we used to have with the nasty zone lookup code too. And it
> causes a horrible pipeline hiccup because the load will not forward
> the data from the (partial) store.
>
> It's really a misfeature of our asm optimizations of the atomic bit
> ops. Using "andb" is slightly smaller, but in this case in particular,
> an "andq" would be a ton faster, and the mask still fits in an imm8,
> so it's not even hugely larger.

I did actually play around with that. I could not get my Skylake
to forward the result from a lock op to a subsequent load: the
latency was the same whether you use lock ; andb or lock ; andl
(32 cycles for my test loop), whereas with the non-atomic versions I
was getting about 15 cycles for andb vs 2 for andl.

I guess the lock op drains the store queue to coherency and does
not allow forwarding so as to provide the memory ordering
semantics.

> But it might also be a good idea to simply use a "cmpxchg" loop here.
> That also gives atomicity guarantees that we don't have with the
> "clear bit and then load the value".

cmpxchg ends up at 19 cycles including the initial load, so it
may be worthwhile. Powerpc has a similar problem with doing a
clear_bit; test_bit (not the size mismatch, but forwarding from
atomic ops being less capable).

Thanks,
Nick

2016-12-26 19:07:55

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Sun, Dec 25, 2016 at 5:16 PM, Nicholas Piggin <[email protected]> wrote:
>
> I did actually play around with that. I could not get my Skylake
> to forward the result from a lock op to a subsequent load: the
> latency was the same whether you use lock ; andb or lock ; andl
> (32 cycles for my test loop), whereas with the non-atomic versions I
> was getting about 15 cycles for andb vs 2 for andl.

Yes, interesting. It does look like the locked ops don't end up having
the partial write issue and the size of the op doesn't matter.

But it's definitely the case that the write buffer hit immediately
after the atomic read-modify-write ends up slowing things down, so the
profile oddity isn't just a profile artifact. I wrote a stupid test
program that did an atomic increment, and then read either the same
value, or an adjacent value in memory (so same instruction sequence,
the difference just being what memory location the read accessed).

Reading the same value after the atomic update was *much* more
expensive than reading the adjacent value, so it causes some kind of
pipeline hiccup (by about 50% of the cost of the atomic op itself:
iow, the "atomic-op followed by read same location" was over 1.5x
slower than "atomic op followed by read of another location").

So the atomic ops don't serialize things entirely, but they *hate*
having the value read (regardless of size) right after being updated,
because it causes some kind of nasty pipeline issue.

A cmpxchg does seem to avoid the issue.

Linus
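
A userspace sketch of the kind of experiment described above: a locked increment followed by a read of either the same word or an adjacent word in the same cache line. This is an assumption about the methodology, not Linus' actual test program; the iteration count and names are invented:

/* x86-64 only; build with: gcc -O2 atomic_readback.c */
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

#define ITERS 100000000UL

/* both words sit in the same 64-byte cache line */
static volatile unsigned long buf[8] __attribute__((aligned(64)));

static double run(volatile unsigned long *readback)
{
	unsigned int aux;
	unsigned long sink = 0;
	uint64_t t0, t1;

	t0 = __rdtscp(&aux);
	for (unsigned long i = 0; i < ITERS; i++) {
		__sync_fetch_and_add(&buf[0], 1);	/* locked increment of buf[0] */
		sink += *readback;			/* read same or adjacent word */
	}
	t1 = __rdtscp(&aux);
	return (double)(t1 - t0) / ITERS + (sink & 0);
}

int main(void)
{
	printf("read same word:     %.1f cycles/iter\n", run(&buf[0]));
	printf("read adjacent word: %.1f cycles/iter\n", run(&buf[1]));
	return 0;
}

If the behaviour described above holds, the same-word case should come out markedly slower than the adjacent-word case.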

2016-12-27 11:28:23

by Nicholas Piggin

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Mon, 26 Dec 2016 11:07:52 -0800
Linus Torvalds <[email protected]> wrote:

> On Sun, Dec 25, 2016 at 5:16 PM, Nicholas Piggin <[email protected]> wrote:
> >
> > I did actually play around with that. I could not get my Skylake
> > to forward the result from a lock op to a subsequent load: the
> > latency was the same whether you use lock ; andb or lock ; andl
> > (32 cycles for my test loop), whereas with the non-atomic versions I
> > was getting about 15 cycles for andb vs 2 for andl.
>
> Yes, interesting. It does look like the locked ops don't end up having
> the partial write issue and the size of the op doesn't matter.
>
> But it's definitely the case that the write buffer hit immediately
> after the atomic read-modify-write ends up slowing things down, so the
> profile oddity isn't just a profile artifact. I wrote a stupid test
> program that did an atomic increment, and then read either the same
> value, or an adjacent value in memory (so same instruction sequence,
> the difference just being what memory location the read accessed).
>
> Reading the same value after the atomic update was *much* more
> expensive than reading the adjacent value, so it causes some kind of
> pipeline hiccup (by about 50% of the cost of the atomic op itself:
> iow, the "atomic-op followed by read same location" was over 1.5x
> slower than "atomic op followed by read of another location").
>
> So the atomic ops don't serialize things entirely, but they *hate*
> having the value read (regardless of size) right after being updated,
> because it causes some kind of nasty pipeline issue.

Sure, I would expect independent operations to be able to run ahead
of the atomic op, and this might point to speculation of consistency
for loads -- an independent younger load can be executed speculatively
before the atomic op and flushed if the cacheline was lost before the
load is completed in order.

I bet forwarding from the store queue in the case of a locked op is more
difficult. I guess it could be done in the same way, but if the load hits
the store queue ahead of the cache, it's more work to then have
the load go to the cache so it can find the line to speculate on while
the flush is in progress. The common case of a load hitting a non-atomic
store would not require this, so it may just not be worthwhile.

Anyway that's speculation (ha). What matters is we know the load is
nasty.

>
> A cmpxchg does seem to avoid the issue.

Yes, I wonder what to do. POWER CPUs have very similar issues and we
have noticed unlock_page and several other cases where atomic ops cause
load stalls. With its ll/sc, POWER would prefer not to do a cmpxchg.

Attached is part of a patch I've been mulling over for a while. I
expect you to hate it, and it does not solve this problem for x86,
but I like being able to propagate values from atomic ops back
to the compiler. Of course, volatile then can't be used either which
is another spanner...

A short-term option is to just have a specific primitive for
clear-unlock-and-test, which we kind of need here anyway to avoid the
memory barrier in an arch-independent way.

Thanks,
Nick

---

After removing the smp_mb__after_atomic and the volatile from test_bit,
applying this directive to the atomic primitives lets test_bit recognise
when the value is already in a register. unlock_page improves:

lwsync
ldarx r10,0,r3
andc r10,r10,r9
stdcx. r10,0,r3
bne- 99c <unlock_page+0x5c>
- ld r9,0(r3)
- andi. r10,r9,2
+ andi. r10,r10,2
beqlr
b 97c <unlock_page+0x3c>
---
arch/powerpc/include/asm/bitops.h | 2 ++
arch/powerpc/include/asm/local.h | 12 ++++++++++++
include/asm-generic/bitops/non-atomic.h | 2 +-
include/linux/compiler.h | 19 +++++++++++++++++++
mm/filemap.c | 2 +-
5 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 59abc620f8e8..0c3e0c384b7d 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -70,6 +70,7 @@ static __inline__ void fn(unsigned long mask, \
: "=&r" (old), "+m" (*p) \
: "r" (mask), "r" (p) \
: "cc", "memory"); \
+ compiler_assign_ptr_val(p, old); \
}

DEFINE_BITOP(set_bits, or, "")
@@ -117,6 +118,7 @@ static __inline__ unsigned long fn( \
: "=&r" (old), "=&r" (t) \
: "r" (mask), "r" (p) \
: "cc", "memory"); \
+ compiler_assign_ptr_val(p, old); \
return (old & mask); \
}

diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index b8da91363864..be965e6c428a 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -33,6 +33,8 @@ static __inline__ long local_add_return(long a, local_t *l)
: "r" (a), "r" (&(l->a.counter))
: "cc", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t;
}

@@ -52,6 +54,8 @@ static __inline__ long local_sub_return(long a, local_t *l)
: "r" (a), "r" (&(l->a.counter))
: "cc", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t;
}

@@ -69,6 +73,8 @@ static __inline__ long local_inc_return(local_t *l)
: "r" (&(l->a.counter))
: "cc", "xer", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t;
}

@@ -96,6 +102,8 @@ static __inline__ long local_dec_return(local_t *l)
: "r" (&(l->a.counter))
: "cc", "xer", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t;
}

@@ -130,6 +138,8 @@ static __inline__ int local_add_unless(local_t *l, long a, long u)
: "r" (&(l->a.counter)), "r" (a), "r" (u)
: "cc", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t != u;
}

@@ -159,6 +169,8 @@ static __inline__ long local_dec_if_positive(local_t *l)
: "r" (&(l->a.counter))
: "cc", "memory");

+ compiler_assign_ptr_val(&(l->a.counter), t);
+
return t;
}

diff --git a/include/asm-generic/bitops/non-atomic.h b/include/asm-generic/bitops/non-atomic.h
index 697cc2b7e0f0..e8b388b98309 100644
--- a/include/asm-generic/bitops/non-atomic.h
+++ b/include/asm-generic/bitops/non-atomic.h
@@ -100,7 +100,7 @@ static inline int __test_and_change_bit(int nr,
* @nr: bit number to test
* @addr: Address to start counting from
*/
-static inline int test_bit(int nr, const volatile unsigned long *addr)
+static inline int test_bit(int nr, const unsigned long *addr)
{
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index cf0fa5d86059..b31353934c6a 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -205,6 +205,25 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
= (unsigned long)&sym;
#endif

+/*
+ * Inform the compiler when the value of a pointer is known.
+ * This can be useful when the caller knows the value but the compiler does
+ * not. Typically, when assembly is used.
+ *
+ * val should be a variable that's likely to be in a register or an immediate,
+ * or a constant.
+ *
+ * This should be used carefully, verifying improvements in generated code.
+ * This is not a hint. It will cause bugs if it is used incorrectly.
+ */
+#ifndef compiler_assign_ptr_val
+# define compiler_assign_ptr_val(ptr, val) \
+do { \
+ if (*(ptr) != (val)) \
+ unreachable(); \
+} while (0)
+#endif
+
#ifndef RELOC_HIDE
# define RELOC_HIDE(ptr, off) \
({ unsigned long __ptr; \
diff --git a/mm/filemap.c b/mm/filemap.c
index 82f26cde830c..0e7d9008e95f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -929,7 +929,7 @@ void unlock_page(struct page *page)
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
+ // smp_mb__after_atomic();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
--
2.11.0

2016-12-27 18:59:09

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 3:19 AM, Nicholas Piggin <[email protected]> wrote:
>
> Attached is part of a patch I've been mulling over for a while. I
> expect you to hate it, and it does not solve this problem for x86,
> but I like being able to propagate values from atomic ops back
> to the compiler. Of course, volatile then can't be used either which
> is another spanner...

Yeah, that patch is disgusting, and doesn't even help x86. It also
depends on the compiler doing the right thing in ways that are not
obviously true.

I'd much rather just add the "xyz_return()" primitives for and/or, the
way we already have atomic_add_return() and friends.

In fact, we could probably play games with bit numbering, and actually
use the atomic ops we already have. For example, if the lock bit was
the top bit, we could unlock by doing "atomic_add_return()" with that
bit, and look at the remaining bits that way.

That would actually work really well on x86, since there we have
"xadd", but we do *not* have "set/clear bit and return old word".

We could make a special case for just the page lock bit, make it bit #7, and use

movb $128,%al
lock xaddb %al,flags

and then test the bits in %al.

And all the RISC architectures would be ok with that too, because they
can just use ll/sc and test the bits with that. So for them, adding a
"atomic_long_and_return()" would be very natural in the general case.

Hmm?

The other alternative is to keep the lock bit as bit #0, and just make
the contention bit be the high bit. Then, on x86, you can do

lock andb $0xfe,flags
js contention

which might be even better. Again, it would be a very special
operation just for unlock. Something like

bit_clear_and_branch_if_negative_byte(mem, label);

and again, it would be trivial to do on most architectures.

Let me try to write a patch or two for testing.

Linus
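
A sketch of what the x86 side of such a primitive might look like. The actual patch was sent as an attachment and is not reproduced in this archive; the function name below matches the helper that appears in the build fix at the end of the thread, and the mask and constraint follow the correction Linus posts further down, so treat the exact form as approximate:

static __always_inline bool
clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
	bool negative;

	asm volatile("lock; andb %2,%1\n\t"
		     "sets %0"
		     : "=q" (negative), "+m" (*(volatile char *)addr)
		     : "ir" ((char)~(1 << nr))
		     : "memory", "cc");
	/* the sign bit of the byte is bit #7, where this proposal puts the waiters bit */
	return negative;
}

unlock_page() could then clear PG_locked and learn from the condition codes of the same locked instruction whether the waiters bit was set, calling wake_up_page_bit() only in that case and avoiding the extra load entirely.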

2016-12-27 19:30:03

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 10:58 AM, Linus Torvalds
<[email protected]> wrote:
>
> The other alternative is to keep the lock bit as bit #0, and just make
> the contention bit be the high bit. Then, on x86, you can do
>
> lock andb $0xfe,flags
> js contention
>
> which might be even better. Again, it would be a very special
> operation just for unlock. Something like
>
> bit_clear_and_branch_if_negative_byte(mem, label);
>
> and again, it would be trivial to do on most architectures.
>
> Let me try to write a patch or two for testing.

Ok, that was easy.

Of course, none of this is *tested*, but it looks superficially
correct, and allows other architectures to do the same optimization if
they want.

On x86, the unlock_page() code now generates

lock; andb $1,(%rdi) #, MEM[(volatile long int *)_7]
js .L114 #,
popq %rbp #
ret

for the actual unlock itself.

Now to actually compile the whole thing and see if it boots..

Linus

2016-12-27 19:33:22

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 11:23 AM, Linus Torvalds
<[email protected]> wrote:
>
> Of course, none of this is *tested*, but it looks superficially
> correct, and allows other architectures to do the same optimization if
> they want.

Oops. I should include the actual patch I was talking about too, shouldn't I?

Linus


Attachments:
patch.diff (2.75 kB)

2016-12-27 19:41:38

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 11:24 AM, Linus Torvalds
<[email protected]> wrote:
>
> Oops. I should include the actual patch I was talking about too, shouldn't I?

And that patch was completely buggy. The mask for the "and" was computed as

+ : "Ir" (1 << nr) : "memory");

but that clears every bit *except* for the one we actually want to
clear. I even posted the code it generates:

lock; andb $1,(%rdi) #, MEM[(volatile long int *)_7]
js .L114 #,

which is obviously crap.

The mask needs to be inverted, of course, and the constraint should be
"ir" (not "Ir" - the "I" is for shift constants) so it should be

+ : "ir" ((char) ~(1 << nr)) : "memory");

new patch attached (but still entirely untested, so caveat emptor).

This patch at least might have a chance in hell of working. Let's see..

Linus


Attachments:
patch.diff (2.76 kB)

2016-12-27 20:17:42

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 11:40 AM, Linus Torvalds
<[email protected]> wrote:
>
> This patch at least might have a chance in hell of working. Let's see..

Ok, with that fixed, things do indeed seem to work.

And things also look fairly good on my "lots of nasty little
shortlived scripts" benchmark ("make -j32 test" for git, in case
people care).

That benchmark used to have "unlock_page()" and "__wake_up_bit()"
together using about 3% of all CPU time.

Now __wake_up_bit() doesn't show up at all (ok, it's something like
0.02%, so it's technically still there, but..) and "unlock_page()" is
at 0.66% of CPU time. So it's about a quarter of where it used to be.
And now it's about the same cost as the try_lock_page() that is
inlined into filemap_map_pages() - it used to be that unlocking the
page was much more expensive than locking it because of all the
unnecessary waitqueue games.

So the benchmark still does a ton of page lock/unlock action, but it
doesn't stand out in the profiles as some kind of WTF thing any more.
And the profiles really show that the cost is the atomic op itself
rather than bad effects from bad code generation, which is what you
want to see.

Would I love to fix this all by not taking the page lock at all? Yes I
would. I suspect we should be able to do something clever and lockless
at least in theory.

But in the meantime, I'm happy with where our page locking overhead
is. And while I haven't seen the NUMA numbers from Dave Hansen with
this all, the early testing from Dave was that the original patch from
Nick already fixed the regression and was the fastest one anyway. And
this optimization will only have improved on things further, although
it might not be as noticeable on NUMA as it is on just a regular
single socket system.

Linus

2016-12-28 03:54:42

by Nicholas Piggin

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, 27 Dec 2016 10:58:59 -0800
Linus Torvalds <[email protected]> wrote:

> On Tue, Dec 27, 2016 at 3:19 AM, Nicholas Piggin <[email protected]> wrote:
> >
> > Attached is part of a patch I've been mulling over for a while. I
> > expect you to hate it, and it does not solve this problem for x86,
> > but I like being able to propagate values from atomic ops back
> > to the compiler. Of course, volatile then can't be used either which
> > is another spanner...
>
> Yeah, that patch is disgusting, and doesn't even help x86.

No, although it would help some cases (but granted the bitops tend to
be problematic in this regard). To be clear I'm not asking to merge it,
just wondered your opinion. (We need something more for unlock_page
anyway because the memory barrier is in the way).

> It also
> depends on the compiler doing the right thing in ways that are not
> obviously true.

Can you elaborate on this? GCC will do the optimization (modulo a
regression https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77647)

> I'd much rather just add the "xyz_return()" primitives for and/or, the
> way we already have atomic_add_return() and friends.
>
> In fact, we could probably play games with bit numbering, and actually
> use the atomic ops we already have. For example, if the lock bit was
> the top bit, we could unlock by doing "atomic_add_return()" with that
> bit, and look at the remaining bits that way.
>
> That would actually work really well on x86, since there we have
> "xadd", but we do *not* have "set/clear bit and return old word".
>
> We could make a special case for just the page lock bit, make it bit #7, and use
>
> movb $128,%al
> lock xaddb %al,flags
>
> and then test the bits in %al.
>
> And all the RISC architectures would be ok with that too, because they
> can just use ll/sc and test the bits with that. So for them, adding a
> "atomic_long_and_return()" would be very natural in the general case.
>
> Hmm?
>
> The other alternative is to keep the lock bit as bit #0, and just make
> the contention bit be the high bit. Then, on x86, you can do
>
> lock andb $0xfe,flags
> js contention
>
> which might be even better. Again, it would be a very special
> operation just for unlock. Something like
>
> bit_clear_and_branch_if_negative_byte(mem, label);
>
> and again, it would be trivial to do on most architectures.
>
> Let me try to write a patch or two for testing.

Patch seems okay, but it's kind of a horrible primitive. What if you
did clear_bit_unlock_and_test_bit, which does a __builtin_constant_p
test on the bit numbers and if they are < 7 and == 7, then do the
fastpath?

Nitpick, can the enum do "= 7" to catch careless bugs? Or BUILD_BUG_ON.

And I'd like to do the same for PG_writeback. AFAIKS whatever approach is
used for PG_locked should work just the same, so no problem there.

Thanks,
Nick

2016-12-28 19:17:34

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Tue, Dec 27, 2016 at 7:53 PM, Nicholas Piggin <[email protected]> wrote:
>>
>> Yeah, that patch is disgusting, and doesn't even help x86.
>
> No, although it would help some cases (but granted the bitops tend to
> be problematic in this regard). To be clear I'm not asking to merge it,
> just wondered your opinion. (We need something more for unlock_page
> anyway because the memory barrier is in the way).

The thing is, the patch seems pointless anyway. The "add_return()"
kind of cases already return the value, so any code that cares can
just use that. And the other cases are downright incorrect, like the
removal of "volatile" from the bit test ops.

>> It also
>> depends on the compiler doing the right thing in ways that are not
>> obviously true.
>
> Can you elaborate on this? GCC will do the optimization (modulo a
> regression https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77647)

So the removal of volatile is just one example of that. You're
basically forcing magical side effects. I've never seen this trick
_documented_, and quite frankly, the gcc people have had a history of
changing their own documentation when it came to their own extensions
(ie they've changed how inline functions work etc).

But I also worry about interactions with different gcc versions, or
with the LLVM people who try to build the kernel with non-gcc
compilers.

Finally, it fundamentally can't work on x86 anyway, except for the
add-return type of operations, which by definition are pointless (see
above).

So everything just screams "this is a horrible approach" to me.

> Patch seems okay, but it's kind of a horrible primitive. What if you
> did clear_bit_unlock_and_test_bit, which does a __builtin_constant_p
> test on the bit numbers and if they are < 7 and == 7, then do the
> fastpath?

So the problem with that is that it makes no sense *except* in the
case where the bit is 7. So why add a "generic" function for something
that really isn't generic?

I agree that it's a hacky interface, but I also happen to believe that
being explicit about what you are actually doing causes less pain.
It's not magical, and it's not subtle. There's no question about what
it does behind your back, and people won't use it by mistake in the
wrong context where it doesn't actually work any better than just
doing the obvious thing.

Linus

2016-12-29 04:09:04

by Nicholas Piggin

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Wed, 28 Dec 2016 11:17:00 -0800
Linus Torvalds <[email protected]> wrote:

> On Tue, Dec 27, 2016 at 7:53 PM, Nicholas Piggin <[email protected]> wrote:
> >>
> >> Yeah, that patch is disgusting, and doesn't even help x86.
> >
> > No, although it would help some cases (but granted the bitops tend to
> > be problematic in this regard). To be clear I'm not asking to merge it,
> > just wondered your opinion. (We need something more for unlock_page
> > anyway because the memory barrier is in the way).
>
> The thing is, the patch seems pointless anyway. The "add_return()"
> kind of cases already return the value, so any code that cares can
> just use that. And the other cases are downright incorrect, like the
> removal of "volatile" from the bit test ops.

Yeah that's true, but you can't carry that over multiple
primitives. For bitops it's often the case you get several bitops
on the same word close together.

>
> >> It also
> >> depends on the compiler doing the right thing in ways that are not
> >> obviously true.
> >
> > Can you elaborate on this? GCC will do the optimization (modulo a
> > regression https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77647)
>
> So the removal of volatile is just one example of that. You're
> basically forcing magical side effects. I've never seen this trick
> _documented_, and quite frankly, the gcc people have had a history of
> changing their own documentation when it came to their own extensions
> (ie they've changed how inline functions work etc).
>
> But I also worry about interactions with different gcc versions, or
> with the LLVM people who try to build the kernel with non-gcc
> compilers.
>
> Finally, it fundamentally can't work on x86 anyway, except for the
> add-return type of operations, which by definition are pointless (see
> above).
>
> So everything just screams "this is a horrible approach" to me.

You're probably right. The few cases where it matters may just be served
with special primitives.

>
> > Patch seems okay, but it's kind of a horrible primitive. What if you
> > did clear_bit_unlock_and_test_bit, which does a __builtin_constant_p
> > test on the bit numbers and if they are < 7 and == 7, then do the
> > fastpath?
>
> So the problem with that is that it makes no sense *except* in the
> case where the bit is 7. So why add a "generic" function for something
> that really isn't generic?

Yeah you're also right, I kind of realized after hitting send.

>
> I agree that it's a hacky interface, but I also happen to believe that
> being explicit about what you are actually doing causes less pain.
> It's not magical, and it's not subtle. There's no question about what
> it does behind your back, and people won't use it by mistake in the
> wrong context where it doesn't actually work any better than just
> doing the obvious thing.

Okay. The name could be a bit better though I think, for readability.
Just a BUILD_BUG_ON if it is not constant and correct bit numbers?

BTW. I just notice in your patch too that you didn't use "nr" in the
generic version.

Thanks,
Nick

2016-12-29 04:16:59

by Linus Torvalds

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Wed, Dec 28, 2016 at 8:08 PM, Nicholas Piggin <[email protected]> wrote:
>
> Okay. The name could be a bit better though I think, for readability.
> Just a BUILD_BUG_ON if it is not constant and correct bit numbers?

I have a slightly edited patch - moved the comments around and added
some new comments (about both the sign bit, but also about how the
smp_mb() shouldn't be necessary even for the non-atomic fallback).

I also did a BUILD_BUG_ON(), except the other way around - keeping it
about the sign bit in the byte, just verifying that yes,
PG_waiters is that sign bit.

> BTW. I just notice in your patch too that you didn't use "nr" in the
> generic version.

And I fixed that too.

Of course, I didn't test the changes (apart from building it). But
I've been running the previous version since yesterday, so far no
issues.

Linus


Attachments:
patch.diff (3.66 kB)

2016-12-29 05:26:35

by Nicholas Piggin

Subject: Re: [PATCH 2/2] mm: add PageWaiters indicating tasks are waiting for a page bit

On Wed, 28 Dec 2016 20:16:56 -0800
Linus Torvalds <[email protected]> wrote:

> On Wed, Dec 28, 2016 at 8:08 PM, Nicholas Piggin <[email protected]> wrote:
> >
> > Okay. The name could be a bit better though I think, for readability.
> > Just a BUILD_BUG_ON if it is not constant and correct bit numbers?
>
> I have a slightly edited patch - moved the comments around and added
> some new comments (about both the sign bit, but also about how the
> smp_mb() shouldn't be necessary even for the non-atomic fallback).

That's a good point -- they're in the same byte, so all architectures
will be able to avoid the extra barrier regardless of how the
primitives are implemented. Good.

>
> I also did a BUILD_BUG_ON(), except the other way around - keeping it
> about the sign bit in the byte, just verifying that yes,
> PG_waiters is that sign bit.

Yep. I still don't like the name, but now you've got PG_waiters
commented there at least. I'll have to live with it.

If we get more cases that want to use a similar function, we might make
a more general primitive for architectures that can optimize these multi
bit ops better than x86. Actually even x86 would prefer to do load ;
cmpxchg rather than bitop ; load for the cases where condition code can't
be used to check result.

>
> > BTW. I just notice in your patch too that you didn't use "nr" in the
> > generic version.
>
> And I fixed that too.
>
> Of course, I didn't test the changes (apart from building it). But
> I've been running the previous version since yesterday, so far no
> issues.

It looks good to me.

Thanks,
Nick

2016-12-29 22:16:17

by Olof Johansson

Subject: [PATCH] mm/filemap: fix parameters to test_bit()

mm/filemap.c: In function 'clear_bit_unlock_is_negative_byte':
mm/filemap.c:933:9: error: too few arguments to function 'test_bit'
return test_bit(PG_waiters);
^~~~~~~~
In file included from arch/arm/include/asm/bitops.h:122:0,
from include/linux/bitops.h:36,
from include/linux/kernel.h:10,
from include/linux/list.h:8,
from include/linux/wait.h:6,
from include/linux/fs.h:5,
from include/linux/dax.h:4,
from mm/filemap.c:14:
include/asm-generic/bitops/non-atomic.h:103:19: note: declared here
static inline int test_bit(int nr, const volatile unsigned long *addr)
^~~~~~~~
mm/filemap.c:934:1: warning: control reaches end of non-void function [-Wreturn-type]
}
^
scripts/Makefile.build:293: recipe for target 'mm/filemap.o' failed

Fixes: b91e1302ad9b ('mm: optimize PageWaiters bit use for unlock_page()')
Signed-off-by: Olof Johansson <[email protected]>
---
mm/filemap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 6b1d96f..d0e4d10 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -930,7 +930,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
{
clear_bit_unlock(nr, mem);
/* smp_mb__after_atomic(); */
- return test_bit(PG_waiters);
+ return test_bit(PG_waiters, mem);
}

#endif
--
2.8.0.rc3.29.gb552ff8