From: Kairui Song <[email protected]>

This makes swapin_readahead the main entry point for swapping in pages,
preparing for optimizations in later commits.

This also lets swapoff make use of the entry-based readahead check.
Swapping off a 10G ZRAM (lzo-rle) swap device is faster:

Before:
time swapoff /dev/zram0
real 0m12.337s
user 0m0.001s
sys 0m12.329s

After:
time swapoff /dev/zram0
real 0m9.728s
user 0m0.001s
sys 0m9.719s

What's more, because swapoff now also uses the no-readahead swapin
helper, this also fixes a bug in the no-readahead case (eg. ZRAM):
when a process that previously swapped out some memory is moved to a
new cgroup, and the original cgroup is dead, swapping off the swap
device will account the swapped-in pages to the process doing the
swapoff instead of to the new cgroup the process was moved to.

This can be easily reproduced by:
- Set up a ramdisk (eg. ZRAM) swap.
- Create memory cgroups A, B and C.
- Spawn process P1 in cgroup A and make it swap out some pages.
- Move process P1 to memory cgroup B.
- Destroy cgroup A.
- Do a swapoff in cgroup C.
- Swapped-in pages are accounted to cgroup C.

This patch fixes it by making the swapped-in pages accounted to
cgroup B.

The same bug exists in the readahead path too; we'll fix it in later
commits.
Signed-off-by: Kairui Song <[email protected]>
---
mm/memory.c | 22 +++++++---------------
mm/swap.h | 6 ++----
mm/swap_state.c | 33 ++++++++++++++++++++++++++-------
mm/swapfile.c | 2 +-
4 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index fba4a5229163..f4237a2e3b93 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3792,6 +3792,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
rmap_t rmap_flags = RMAP_NONE;
bool exclusive = false;
swp_entry_t entry;
+ bool swapcached;
pte_t pte;
vm_fault_t ret = 0;
@@ -3855,22 +3856,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
swapcache = folio;
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache and readahead */
- page = swapin_no_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- if (page)
- folio = page_folio(page);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ vmf, &swapcached);
+ if (page) {
+ folio = page_folio(page);
+ if (swapcached)
+ swapcache = folio;
} else {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- if (page)
- folio = page_folio(page);
- swapcache = folio;
- }
-
- if (!folio) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
diff --git a/mm/swap.h b/mm/swap.h
index ea4be4791394..f82d43d7b52a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -55,9 +55,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
-struct page *swapin_no_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
+ struct vm_fault *vmf, bool *swapcached);
static inline unsigned int folio_swap_flags(struct folio *folio)
{
@@ -89,7 +87,7 @@ static inline struct page *swap_cluster_readahead(swp_entry_t entry,
}
static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct vm_fault *vmf, bool *swapcached)
{
return NULL;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 45dd8b7c195d..fd0047ae324e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -316,6 +316,11 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
release_pages(pages, nr);
}
+static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
+{
+ return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
+}
+
static inline bool swap_use_vma_readahead(void)
{
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
@@ -861,8 +866,8 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
* Returns the struct page for entry and addr after the swap entry is read
* in.
*/
-struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+static struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
@@ -904,6 +909,8 @@ struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
+ * @swapcached: pointer to a bool used to indicate whether the
+ * page is swapped in through the swapcache.
*
* Returns the struct page for entry and addr, after queueing swapin.
*
@@ -912,17 +919,29 @@ struct page *swapin_no_readahead(swp_entry_t entry, gfp_t gfp_mask,
* or vma-based(ie, virtual address based on faulty address) readahead.
*/
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct vm_fault *vmf, bool *swapcached)
{
struct mempolicy *mpol;
- pgoff_t ilx;
struct page *page;
+ pgoff_t ilx;
+ bool cached;
mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
- page = swap_use_vma_readahead() ?
- swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
- swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
+ page = swapin_no_readahead(entry, gfp_mask, vmf);
+ cached = false;
+ } else if (swap_use_vma_readahead()) {
+ page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
+ cached = true;
+ } else {
+ page = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ cached = true;
+ }
mpol_cond_put(mpol);
+
+ if (swapcached)
+ *swapcached = cached;
+
return page;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 756104ebd585..0142bfc71b81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1874,7 +1874,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
};
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- &vmf);
+ &vmf, NULL);
if (page)
folio = page_folio(page);
}
--
2.42.0

On Sun, Nov 19, 2023 at 11:48 AM Kairui Song <[email protected]> wrote:
>
> [...]
>
> This patch fixes it by making the swapped-in pages accounted to cgroup B.

Can you help me understand where the code makes this behavior change?
As far as I can tell, the patch just allows swapoff to use the code
path that avoids readahead and the swap cache. I haven't figured out
where the code charges the swapped-in pages to B.

Does it need to be ZRAM? Will zswap or an SSD work in your example?

>
> The same bug exists in the readahead path too; we'll fix it in later
> commits.

As discussed in another email, this behavior change is against the
current memcg memory charging model. Better to separate this behavior
change from the code cleanup.

>
> [...]
>
> struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> - struct vm_fault *vmf)
> + struct vm_fault *vmf, bool *swapcached)

At this point the function name "swapin_readahead" no longer matches
what the function does, because readahead is just one of its
behaviors; it can also swap in without readahead. It needs a better
name: this function has become a generic swapin_entry.

>
> [...]
> + if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> + page = swapin_no_readahead(entry, gfp_mask, vmf);
> + cached = false;
> + } else if (swap_use_vma_readahead()) {
> + page = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
> + cached = true;
> + } else {

Notice that which flavor of swapin to use is actually a property of
the swap device: for a given device, it always calls the same swapin
function.

One idea is to have a VFS-like API for swap devices. This could be a
swapin_entry callback in a swap_ops struct; different swap devices
would just register different swapin_entry hooks. That way we don't
need to look at the device flags to decide what to do, we can just
call swap_ops->swapin_entry() and the function pointer will point to
the right implementation. Then we don't need this three-way if/else
block. It is also more flexible for registering custom swap layout
implementations. Something to consider for the future; you don't have
to implement it in this series.
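
As a rough sketch of the idea (hypothetical, nothing like this exists
in the kernel today, and all names here are made up for illustration):

    /* Hypothetical per-device swapin hook, not an existing API. */
    struct swap_ops {
            struct page *(*swapin_entry)(swp_entry_t entry,
                                         gfp_t gfp_mask,
                                         struct vm_fault *vmf,
                                         bool *swapcached);
    };

A SWP_SYNCHRONOUS_IO device like ZRAM could then register the
no-readahead implementation, while other devices register a cluster-
or VMA-readahead one, and the caller would not need to check flags.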
Chris Li <[email protected]> 于2023年11月21日周二 14:18写道:
>
> > [...]
> > This patch fixes it by making the swapped-in pages accounted to cgroup B.
>
> Can you help me understand where the code makes this behavior change?
> As far as I can tell, the patch just allows swapoff to use the code
> path that avoids readahead and the swap cache. I haven't figured out
> where the code charges the swapped-in pages to B.

Yes, swapoff always calls swapin_readahead to swap in pages. Before
this patch, the page allocation/charge path was like this (here we
assume there is only a ZRAM device, so VMA readahead is used):

    swapoff
      swapin_readahead
        swap_vma_readahead
          __read_swap_cache_async
            alloc_pages_mpol
            // *** Page charge happens here, and
            // note the second argument is NULL
            mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)

After the patch:

    swapoff
      swapin_readahead
        swapin_no_readahead
          vma_alloc_folio
          // *** Page charge happens here, and
          // note the second argument is vma->vm_mm
          mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)

In the previous call path (swap_vma_readahead), the mm_struct info is
not passed down to the final allocation/charge, but now
swapin_no_readahead can simply pass the mm_struct to the
allocation/charge.

mem_cgroup_swapin_charge_folio will take the memcg of the mm_struct
as the charge target when the entry's memcg is not online.
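
Roughly, the fallback logic looks like this (a simplified sketch of
the behavior described above, not the exact code in mm/memcontrol.c;
the helpers marked "illustrative" are made up for this sketch):

    int mem_cgroup_swapin_charge_folio(struct folio *folio,
                                       struct mm_struct *mm,
                                       gfp_t gfp, swp_entry_t entry)
    {
            struct mem_cgroup *memcg;

            /* Prefer the memcg recorded for the swap entry... */
            memcg = memcg_from_swap_entry(entry);     /* illustrative */
            if (!memcg || !mem_cgroup_online(memcg))
                    /*
                     * ...but if that cgroup is offline, fall back to
                     * the mm's memcg: a NULL mm means the charging
                     * task (the one running swapoff, cgroup C), while
                     * vma->vm_mm means the owning process (cgroup B).
                     */
                    memcg = memcg_from_mm(mm);        /* illustrative */

            return charge_folio(folio, memcg, gfp);   /* illustrative */
    }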
> Does it need to be ZRAM? Will zswap or an SSD work in your example?
The "swapoff moves memory charge out of a dying cgroup" issue exists
for all swap devices, just this patch changed the behavior for ZRAM
(which bypass swapcache and uses swapin_no_readahead).
>
> >
> > The same bug exists in the readahead path too; we'll fix it in later
> > commits.
>
> As discussed in another email, this behavior change is against the
> current memcg memory charging model.
> Better to separate this behavior change from the code cleanup.

Good suggestion.

>
> > [...]
> > struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > - struct vm_fault *vmf)
> > + struct vm_fault *vmf, bool *swapcached)
>
> At this point the function name "swapin_readahead" no longer matches
> what the function does, because readahead is just one of its
> behaviors; it can also swap in without readahead. It needs a better
> name: this function has become a generic swapin_entry.

I rename this function in later commits; I can rename it here instead
to avoid confusion.
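
Something like this, using the name you suggested (just a sketch of
the rename; the final signature may differ):

    struct page *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
                              struct vm_fault *vmf, bool *swapcached);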
>
> > [...]
>
> One idea is to have a VFS-like API for swap devices. [...] Something
> to consider for the future; you don't have to implement it in this
> series.

Interesting idea, we may look into this later.

Thanks!

On Mon, Nov 20, 2023 at 10:35 PM Kairui Song <[email protected]> wrote:
>
> > > [...]
> > >
> > > This can be easily reproduced by:
> > > - Setup a ramdisk (eg. ZRAM) swap.
> > > - Create memory cgroup A, B and C.
> > > - Spawn process P1 in cgroup A and make it swap out some pages.
> > > - Move process P1 to memory cgroup B.
> > > - Destroy cgroup A.
> > > - Do a swapoff in cgroup C.
> > > - Swapped in pages is accounted into cgroup C.

In a strange way it makes sense to charge C.
Swap out == free up memory.
Swap in == consume memory.
C turns off swap; effectively this behavior will consume a lot of
memory. C gets charged, so if C runs out of memory, C gets punished:
it will not be able to continue swapping in memory, and the problem
stays under control.

> > >
> > > This patch fixes it by making the swapped-in pages accounted to cgroup B.

One problem I can see with your fix: C is the one doing the bad deed,
causing consumption of system memory. Punishing B is not going to
stop C from continuing the bad deed, and if you let C run against B's
limit, things get complicated very quickly.

> > [...]
>
> [...]

I see. Thanks for the detailed explanation. With that information, let
me go over your patch again.

> [...]
> The "swapoff moves memory charge out of a dying cgroup" issue exists

There is a whole class of zombie memcg issues that the current memory
cgroup model can't handle well.

> for all swap devices; this patch only changes the behavior for ZRAM
> (which bypasses the swapcache and uses swapin_no_readahead).

Thanks for the clarification.
Chris
Chris Li <[email protected]> 于2023年11月21日周二 15:41写道:
>
> [...]
>
> In a strange way it makes sense to charge C.
> Swap out == free up memory.
> Swap in == consume memory.
> C turns off swap; effectively this behavior will consume a lot of
> memory. C gets charged, so if C runs out of memory, C gets punished:
> it will not be able to continue swapping in memory, and the problem
> stays under control.

Yes, I think charging either C or B makes sense in its own way; to me
the current behavior is just counter-intuitive.

Imagine there is a cgroup PC1 with child cgroups CC1 and CC2. If a
process swapped out some memory in CC1, then moved to CC2, and CC1 is
dying, on swapoff the charge will be moved out of PC1...

And swapoff often happens in some unlimited admin cgroup or some
cgroup for management agents.

If PC1 has a memory limit, the process in it can breach the limit
easily; we will see a process that never left PC1 having a much
higher RSS than PC1/CC1/CC2's limit.

And if there is a limit on the management agent's cgroup, the agent
will OOM instead of the OOM happening in PC1.

Simply moving a process between child cgroups of the same parent
won't cause a similar issue; things only get weird when swapoff is
involved.

And actually, with multiple layers of swap, it's less risky to swap
off a device, since other swap devices can catch the overcommitted
memory.

Oh, and there is one more case I forgot to cover in this series:
moving a process is indeed not something that happens very
frequently, but a process running in a cgroup, exiting, and leaving
some shmem swapped out could be a common case. The current behavior
on swapoff will move these charges out of the original parent cgroup
too.

So maybe a more ideal solution for swapoff is to simply always charge
a dying cgroup's parent cgroup? Maybe a sysctl/cmdline option could
be introduced to control the behavior.
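
As a sketch, the idea would be a fallback along these lines
(hypothetical, not part of this series; parent_mem_cgroup() and
mem_cgroup_online() are existing helpers, but the placement here is
made up):

    /* Instead of falling back to the charging task's memcg, walk up
     * from a dying memcg to its nearest live ancestor. */
    while (memcg && !mem_cgroup_online(memcg))
            memcg = parent_mem_cgroup(memcg);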

On Tue, Nov 21, 2023 at 12:33 AM Kairui Song <[email protected]> wrote:
> > [...]
>
> Yes, I think charging either C or B makes sense in its own way; to me
> the current behavior is just counter-intuitive.
>
> Imagine there is a cgroup PC1 with child cgroups CC1 and CC2. If a
> process swapped out some memory in CC1, then moved to CC2, and CC1
> is dying, on swapoff the charge will be moved out of PC1...

Yes. In the spirit of punishing the one that is actively doing bad
deeds, the swapoff charge moving out of PC1 makes sense, because the
memory is effectively not used in PC1 any more.

The question is: why do you want to move the process from CC1 to CC2?
Moving processes between cgroups is not supported very well in the
current cgroup model; the memory already charged to the current
cgroup will not move with the process.

> And swapoff often happens in some unlimited admin cgroup or some
> cgroup for management agents.
>
> If PC1 has a memory limit, the process in it can breach the limit
> easily; we will see a process that never left PC1 having a much
> higher RSS than PC1/CC1/CC2's limit.

Hmm, how do you set the limit on PC1? Do you set a hard limit with
"memory.max"? What does "PC1/memory.stat" show?

If you have a script to reproduce this behavior, I would love to
learn more.

> And if there is a limit on the management agent's cgroup, the agent
> will OOM instead of the OOM happening in PC1.

It seems what you want is a way for the admin agent to turn off
swapping on PC1 and let PC1 OOM, right? In other words, you want the
admin agent to turn off swapping in PC1's context.

How about starting a bash script that adds itself to
PC1/cgroup.procs, then calls swapoff for PC1? Will that solve your
problem?

I am still not sure why you want to turn off swap for PC1. If you
know PC1 is going to OOM, why not just kill it? A little more context
on the problem would help me understand your use case.

> Simply moving a process between child cgroups of the same parent
> won't cause a similar issue; things only get weird when swapoff is
> involved.

Again, having a reproducing script is very helpful for understanding
what is going on. It can serve as a regression testing tool.

> And actually, with multiple layers of swap, it's less risky to swap
> off a device, since other swap devices can catch the overcommitted
> memory.

Sure. Do you have any feedback on whether "memory.swap.tiers" would
address your need to control swap usage for individual cgroups?

>
> Oh, and there is one more case I forgot to cover in this series:
> moving a process is indeed not something that happens very
> frequently, but a process running in a cgroup, exiting, and leaving
> some shmem swapped out could be a common case.

That is the zombie memcg problem with shared memory. The current
cgroup model does not work well with shared memory objects. It is
another can of worms.

> The current behavior on swapoff will move these charges out of the
> original parent cgroup too.
>
> So maybe a more ideal solution for swapoff is to simply always charge
> a dying cgroup's parent cgroup?

That is the recharging idea. At this year's LSFMM there was a
presentation about zombie memcgs, and recharging/reparenting was one
of the proposals. If I recall correctly, someone from SUSE commented
that they tried it in their kernel at one point, but it had a lot of
issues and the feature was eventually removed.

I also made a proposal for a shared memory cgroup model; I can dig it
up if you are interested.

> Maybe a sysctl/cmdline option could be introduced to control the
> behavior.

I am against this kind of one-off behavior change without a
consistent model of charging the memory. Even behind a sysctl, it is
code we need to maintain and test to make sure it functions properly.

If you want to change the memory charging model, I would like to see
the high-level principle it follows, so we can reason about other
usage cases consistently. The last thing I want is making up rules as
we go; we might end up with contradicting rules and back ourselves
into a corner in the future.

I do care about your usage case; I just want to address it
consistently. My understanding of the current memory cgroup model is:
charge to the first user, punish the troublemaker.

BTW, I am still reviewing your 24-patch series, halfway through. It
will take me a bit of time to write up the reply.

Chris