Both with frontswap/zswap, and with some extremely fast IO devices,
swap IO will be done before the "asynchronous" swap_readpage() call
has returned.
In that case, doing swap readahead only wastes memory, increases
latency, and increases the chances of needing to evict something more
useful from memory. In that case, just skip swap readahead.
Split swap single page readahead into its own function, to make
the next patch easier to read. No functional changes.
Signed-off-by: Rik van Riel <[email protected]>
---
mm/swap_state.c | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c16eebb81d8b..aacb9ba53f63 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -594,6 +594,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
return pages;
}
+static struct page *swap_cluster_read_one(swp_entry_t entry,
+ unsigned long offset, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr, bool readahead)
+{
+ bool page_allocated;
+ struct page *page;
+
+ page =__read_swap_cache_async(swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr, &page_allocated);
+ if (!page)
+ return NULL;
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (readahead) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
+ put_page(page);
+ return page;
+}
+
/**
* swap_cluster_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -615,14 +637,13 @@ static unsigned long swapin_nr_pages(unsigned long offset)
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
- struct page *page;
unsigned long entry_offset = swp_offset(entry);
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
- bool do_poll = true, page_allocated;
+ bool do_poll = true;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -649,19 +670,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
- page = __read_swap_cache_async(
- swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr, &page_allocated);
- if (!page)
- continue;
- if (page_allocated) {
- swap_readpage(page, false);
- if (offset != entry_offset) {
- SetPageReadahead(page);
- count_vm_event(SWAP_RA);
- }
- }
- put_page(page);
+ swap_cluster_read_one(entry, offset, gfp_mask, vma, addr,
+ offset != entry_offset);
}
blk_finish_plug(&plug);
--
2.25.4
Check whether a swap page was obtained instantaneously, for example
because it is in zswap, or on a very fast IO device which uses busy
waiting, and we did not wait on IO to swap in this page.
If no IO was needed to get the swap page we want, kicking off readahead
on surrounding swap pages is likely to be counterproductive, because the
extra loads will cause additional latency, use up extra memory, and chances
are the surrounding pages in swap are just as fast to load as this one,
making readahead pointless.
Signed-off-by: Rik van Riel <[email protected]>
---
mm/swap_state.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index aacb9ba53f63..6919f9d5fe88 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -637,6 +637,7 @@ static struct page *swap_cluster_read_one(swp_entry_t entry,
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
+ struct page *page;
unsigned long entry_offset = swp_offset(entry);
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
@@ -668,11 +669,18 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
end_offset = si->max - 1;
blk_start_plug(&plug);
+ /* If we read the page without waiting on IO, skip readahead. */
+ page = swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, false);
+ if (page && PageUptodate(page))
+ goto skip_unplug;
+
+ /* Ok, do the async read-ahead now. */
for (offset = start_offset; offset <= end_offset ; offset++) {
- /* Ok, do the async read-ahead now */
- swap_cluster_read_one(entry, offset, gfp_mask, vma, addr,
- offset != entry_offset);
+ if (offset == entry_offset)
+ continue;
+ swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, true);
}
+skip_unplug:
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
--
2.25.4
On Tue, Sep 22, 2020 at 10:02 AM Rik van Riel <[email protected]> wrote:
>
> Check whether a swap page was obtained instantaneously, for example
> because it is in zswap, or on a very fast IO device which uses busy
> waiting, and we did not wait on IO to swap in this page.
> If no IO was needed to get the swap page we want, kicking off readahead
> on surrounding swap pages is likely to be counterproductive, because the
> extra loads will cause additional latency, use up extra memory, and chances
> are the surrounding pages in swap are just as fast to load as this one,
> making readahead pointless.
>
> Signed-off-by: Rik van Riel <[email protected]>
> ---
> mm/swap_state.c | 14 +++++++++++---
> 1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index aacb9ba53f63..6919f9d5fe88 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -637,6 +637,7 @@ static struct page *swap_cluster_read_one(swp_entry_t entry,
> struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> struct vm_fault *vmf)
Why not do this for swap_vma_readahead() too? swap_cluster_read_one()
can be used in swap_vma_readahead() too.
> {
> + struct page *page;
> unsigned long entry_offset = swp_offset(entry);
> unsigned long offset = entry_offset;
> unsigned long start_offset, end_offset;
> @@ -668,11 +669,18 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> end_offset = si->max - 1;
>
> blk_start_plug(&plug);
> + /* If we read the page without waiting on IO, skip readahead. */
> + page = swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, false);
> + if (page && PageUptodate(page))
> + goto skip_unplug;
> +
> + /* Ok, do the async read-ahead now. */
> for (offset = start_offset; offset <= end_offset ; offset++) {
> - /* Ok, do the async read-ahead now */
> - swap_cluster_read_one(entry, offset, gfp_mask, vma, addr,
> - offset != entry_offset);
> + if (offset == entry_offset)
> + continue;
> + swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, true);
> }
> +skip_unplug:
> blk_finish_plug(&plug);
>
> lru_add_drain(); /* Push any new pages onto the LRU now */
Best Regards,
Huang, Ying
On Tue, 2020-09-22 at 11:13 +0800, huang ying wrote:
> On Tue, Sep 22, 2020 at 10:02 AM Rik van Riel <[email protected]>
> wrote:
> > Check whether a swap page was obtained instantaneously, for example
> > because it is in zswap, or on a very fast IO device which uses busy
> > waiting, and we did not wait on IO to swap in this page.
> > If no IO was needed to get the swap page we want, kicking off
> > readahead
> > on surrounding swap pages is likely to be counterproductive,
> > because the
> > extra loads will cause additional latency, use up extra memory, and
> > chances
> > are the surrounding pages in swap are just as fast to load as this
> > one,
> > making readahead pointless.
> >
> > Signed-off-by: Rik van Riel <[email protected]>
> > ---
> > mm/swap_state.c | 14 +++++++++++---
> > 1 file changed, 11 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index aacb9ba53f63..6919f9d5fe88 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -637,6 +637,7 @@ static struct page
> > *swap_cluster_read_one(swp_entry_t entry,
> > struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t
> > gfp_mask,
> > struct vm_fault *vmf)
>
> Why not do this for swap_vma_readahead()
> too? swap_cluster_read_one()
> can be used in swap_vma_readahead() too.
Good point, I should do the same thing for swap_vma_readahead()
as well. Let me do that and send in a version 2 of the series.
--
All Rights Reversed.
On Mon, 21 Sep 2020 22:01:46 -0400 Rik van Riel <[email protected]> wrote:
> Both with frontswap/zswap, and with some extremely fast IO devices,
> swap IO will be done before the "asynchronous" swap_readpage() call
> has returned.
>
> In that case, doing swap readahead only wastes memory, increases
> latency, and increases the chances of needing to evict something more
> useful from memory. In that case, just skip swap readahead.
Any quantitative testing results?
On Mon, Sep 21, 2020 at 10:01:47PM -0400, Rik van Riel wrote:
> +static struct page *swap_cluster_read_one(swp_entry_t entry,
> + unsigned long offset, gfp_t gfp_mask,
> + struct vm_area_struct *vma, unsigned long addr, bool readahead)
> +{
> + bool page_allocated;
> + struct page *page;
> +
> + page =__read_swap_cache_async(swp_entry(swp_type(entry), offset),
> + gfp_mask, vma, addr, &page_allocated);
Missing whitespace after the "=".
> + if (!page)
> + return NULL;
> + if (page_allocated) {
> + swap_readpage(page, false);
> + if (readahead) {
> + SetPageReadahead(page);
> + count_vm_event(SWAP_RA);
> + }
> + }
> + put_page(page);
> + return page;
> +}
I think swap_vma_readahead can be switched to your new helper
pretty trivially as well, as could many of the users of
read_swap_cache_async.
On Mon, Sep 21, 2020 at 10:01:48PM -0400, Rik van Riel wrote:
> + struct page *page;
> unsigned long entry_offset = swp_offset(entry);
> unsigned long offset = entry_offset;
> unsigned long start_offset, end_offset;
> @@ -668,11 +669,18 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> end_offset = si->max - 1;
>
> blk_start_plug(&plug);
> + /* If we read the page without waiting on IO, skip readahead. */
> + page = swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, false);
> + if (page && PageUptodate(page))
> + goto skip_unplug;
> +
At least for the normal block device path the plug will prevent the
I/O submission from actually happening and thus PageUptodate from
becoming true. I think we need to split the different code paths
more cleanly.
Btw, what device type and media did you test this with? What kind of
numbers did you get on what workload?
On Tue, 2020-09-22 at 10:12 -0700, Andrew Morton wrote:
> On Mon, 21 Sep 2020 22:01:46 -0400 Rik van Riel <[email protected]>
> wrote:
>
> > Both with frontswap/zswap, and with some extremely fast IO devices,
> > swap IO will be done before the "asynchronous" swap_readpage() call
> > has returned.
> >
> > In that case, doing swap readahead only wastes memory, increases
> > latency, and increases the chances of needing to evict something
> > more
> > useful from memory. In that case, just skip swap readahead.
>
> Any quantitative testing results?
I have test results with a real workload now.
Without this patch, enabling zswap results in about an
8% increase in p99 request latency. With these patches,
the latency penalty for enabling zswap is under 1%.
Enabling zswap
allows us to give the main workload a
little more memory, since the spikes in memory demand
caused by things like system management software no
longer cause large latency issues.
--
All Rights Reversed.
On Mon, 2020-10-05 at 13:32 -0400, Rik van Riel wrote:
> On Tue, 2020-09-22 at 10:12 -0700, Andrew Morton wrote:
> > On Mon, 21 Sep 2020 22:01:46 -0400 Rik van Riel <[email protected]>
> > wrote:
> > Any quantitative testing results?
>
> I have test results with a real workload now.
>
> Without this patch, enabling zswap results in about an
> 8% increase in p99 request latency. With these patches,
> the latency penalty for enabling zswap is under 1%.
Never mind that. On larger tests the effect seems to disappear,
probably because the logic in __swapin_nr_pages() already reduces
the number of pages read ahead to 2 on workloads with lots of
random access.
That reduces the latency effects observed.
Now we might
still see some memory waste due to decompressing
pages we don't need, but I have not seen any real effects from
that yet, either.
I think it may be time to focus on a larger memory waste with
zswap: leaving the compressed copy of memory around when we
decompress the memory at swapin time. More aggressively freeing
the compressed memory will probably buy us more than reducing
readahead.
--
All Rights Reversed.