2014-01-21 03:07:50

by Cai Liu

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Thanks for your review.

2014/1/21 Minchan Kim <[email protected]>:
> Hello Cai,
>
> On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>> zswap can support multiple swapfiles. So we need to check
>> all zbud pool pages in zswap.
>>
>> Version 2:
>> * add *total_zbud_pages* in zbud to record all the pages in pools
>> * move the updating of pool pages statistics to
>> alloc_zbud_page/free_zbud_page to hide the details
>>
>> Signed-off-by: Cai Liu <[email protected]>
>> ---
>> include/linux/zbud.h | 2 +-
>> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>> mm/zswap.c | 4 ++--
>> 3 files changed, 35 insertions(+), 15 deletions(-)
>>
>> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>> index 2571a5c..1dbc13e 100644
>> --- a/include/linux/zbud.h
>> +++ b/include/linux/zbud.h
>> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>> +u64 zbud_get_pool_size(void);
>>
>> #endif /* _ZBUD_H_ */
>> diff --git a/mm/zbud.c b/mm/zbud.c
>> index 9451361..711aaf4 100644
>> --- a/mm/zbud.c
>> +++ b/mm/zbud.c
>> @@ -52,6 +52,13 @@
>> #include <linux/spinlock.h>
>> #include <linux/zbud.h>
>>
>> +/*********************************
>> +* statistics
>> +**********************************/
>> +
>> +/* zbud pages in all pools */
>> +static u64 total_zbud_pages;
>> +
>> /*****************
>> * Structures
>> *****************/
>> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>> return zhdr;
>> }
>>
>> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>> +{
>> + struct page *page;
>> +
>> + page = alloc_page(gfp);
>> +
>> + if (page) {
>> + pool->pages_nr++;
>> + total_zbud_pages++;
>
> Who protect race?

Yes, here pool->pages_nr and also total_zbud_pages are not protected.
I will re-do it.

I will change *total_zbud_pages* to an atomic type.
For *pool->pages_nr*, one way is to use pool->lock to protect it, but I
think that is too heavy.
So is it OK to change pages_nr to an atomic type too?
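Something like this untested sketch, assuming pages_nr in struct zbud_pool
can become atomic64_t as well:

/* zbud pages in all pools, updated without holding any pool lock */
static atomic64_t total_zbud_pages = ATOMIC64_INIT(0);

static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
{
        struct page *page = alloc_page(gfp);

        if (page) {
                atomic64_inc(&pool->pages_nr);  /* pages_nr as atomic64_t */
                atomic64_inc(&total_zbud_pages);
        }

        return page;
}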


>
>> + }
>> +
>> + return page;
>> +}
>> +
>> +
>> /* Resets the struct page fields and frees the page */
>> -static void free_zbud_page(struct zbud_header *zhdr)
>> +static void free_zbud_page(struct zbud_pool *pool, struct zbud_header *zhdr)
>> {
>> __free_page(virt_to_page(zhdr));
>> +
>> + pool->pages_nr--;
>> + total_zbud_pages--;
>> }
>>
>> /*
>> @@ -279,11 +304,10 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
>>
>> /* Couldn't find unbuddied zbud page, create new one */
>> spin_unlock(&pool->lock);
>> - page = alloc_page(gfp);
>> + page = alloc_zbud_page(pool, gfp);
>> if (!page)
>> return -ENOMEM;
>> spin_lock(&pool->lock);
>> - pool->pages_nr++;
>> zhdr = init_zbud_page(page);
>> bud = FIRST;
>>
>> @@ -349,8 +373,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
>> if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
>> /* zbud page is empty, free */
>> list_del(&zhdr->lru);
>> - free_zbud_page(zhdr);
>> - pool->pages_nr--;
>> + free_zbud_page(pool, zhdr);
>> } else {
>> /* Add to unbuddied list */
>> freechunks = num_free_chunks(zhdr);
>> @@ -447,8 +470,7 @@ next:
>> * Both buddies are now free, free the zbud page and
>> * return success.
>> */
>> - free_zbud_page(zhdr);
>> - pool->pages_nr--;
>> + free_zbud_page(pool, zhdr);
>> spin_unlock(&pool->lock);
>> return 0;
>> } else if (zhdr->first_chunks == 0 ||
>> @@ -496,14 +518,12 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
>>
>> /**
>> * zbud_get_pool_size() - gets the zbud pool size in pages
>> - * @pool: pool whose size is being queried
>> *
>> - * Returns: size in pages of the given pool. The pool lock need not be
>> - * taken to access pages_nr.
>> + * Returns: size in pages of all the zbud pools.
>> */
>> -u64 zbud_get_pool_size(struct zbud_pool *pool)
>> +u64 zbud_get_pool_size(void)
>> {
>> - return pool->pages_nr;
>> + return total_zbud_pages;
>> }
>>
>> static int __init init_zbud(void)
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 5a63f78..ef44d9d 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -291,7 +291,7 @@ static void zswap_free_entry(struct zswap_tree *tree,
>> zbud_free(tree->pool, entry->handle);
>> zswap_entry_cache_free(entry);
>> atomic_dec(&zswap_stored_pages);
>> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>> + zswap_pool_pages = zbud_get_pool_size();
>> }
>>
>> /* caller must hold the tree lock */
>> @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>>
>> /* update stats */
>> atomic_inc(&zswap_stored_pages);
>> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>> + zswap_pool_pages = zbud_get_pool_size();
>>
>> return 0;
>>
>> --
>> 1.7.10.4
>>
>
> --
> Kind regards,
> Minchan Kim


2014-01-21 03:58:55

by Bob Liu

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

On Tue, Jan 21, 2014 at 11:07 AM, Cai Liu <[email protected]> wrote:
> Thanks for your review.
>
> 2014/1/21 Minchan Kim <[email protected]>:
>> Hello Cai,
>>
>> On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>>> zswap can support multiple swapfiles. So we need to check
>>> all zbud pool pages in zswap.
>>>
>>> Version 2:
>>> * add *total_zbud_pages* in zbud to record all the pages in pools
>>> * move the updating of pool pages statistics to
>>> alloc_zbud_page/free_zbud_page to hide the details
>>>
>>> Signed-off-by: Cai Liu <[email protected]>
>>> ---
>>> include/linux/zbud.h | 2 +-
>>> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>>> mm/zswap.c | 4 ++--
>>> 3 files changed, 35 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>>> index 2571a5c..1dbc13e 100644
>>> --- a/include/linux/zbud.h
>>> +++ b/include/linux/zbud.h
>>> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>>> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>>> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>>> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>>> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>>> +u64 zbud_get_pool_size(void);
>>>
>>> #endif /* _ZBUD_H_ */
>>> diff --git a/mm/zbud.c b/mm/zbud.c
>>> index 9451361..711aaf4 100644
>>> --- a/mm/zbud.c
>>> +++ b/mm/zbud.c
>>> @@ -52,6 +52,13 @@
>>> #include <linux/spinlock.h>
>>> #include <linux/zbud.h>
>>>
>>> +/*********************************
>>> +* statistics
>>> +**********************************/
>>> +
>>> +/* zbud pages in all pools */
>>> +static u64 total_zbud_pages;
>>> +
>>> /*****************
>>> * Structures
>>> *****************/
>>> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>>> return zhdr;
>>> }
>>>
>>> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>>> +{
>>> + struct page *page;
>>> +
>>> + page = alloc_page(gfp);
>>> +
>>> + if (page) {
>>> + pool->pages_nr++;
>>> + total_zbud_pages++;
>>
>> Who protect race?
>
> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
> I will re-do it.
>
> I will change *total_zbud_pages* to an atomic type.

And how about just adding total_zbud_pages++ and leaving pool->pages_nr in
its original place, which is already protected by pool->lock?
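I.e. in zbud_alloc, roughly (a sketch only; note total_zbud_pages would
still need to be atomic or get its own lock, because pool->lock only
serializes one pool):

        spin_lock(&pool->lock);
        pool->pages_nr++;                       /* under pool->lock, as today */
        atomic64_inc(&total_zbud_pages);        /* global, atomic across pools */
        zhdr = init_zbud_page(page);
        bud = FIRST;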

> For *pool->pages_nr*, one way is to use pool->lock to protect it, but I
> think that is too heavy.
> So is it OK to change pages_nr to an atomic type too?
>

--
Regards,
--Bob

2014-01-21 05:03:41

by Minchan Kim

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Please check your MUA and don't break thread.

On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
> Thanks for your review.
>
> 2014/1/21 Minchan Kim <[email protected]>:
> > Hello Cai,
> >
> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
> >> zswap can support multiple swapfiles. So we need to check
> >> all zbud pool pages in zswap.
> >>
> >> Version 2:
> >> * add *total_zbud_pages* in zbud to record all the pages in pools
> >> * move the updating of pool pages statistics to
> >> alloc_zbud_page/free_zbud_page to hide the details
> >>
> >> Signed-off-by: Cai Liu <[email protected]>
> >> ---
> >> include/linux/zbud.h | 2 +-
> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
> >> mm/zswap.c | 4 ++--
> >> 3 files changed, 35 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> >> index 2571a5c..1dbc13e 100644
> >> --- a/include/linux/zbud.h
> >> +++ b/include/linux/zbud.h
> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
> >> +u64 zbud_get_pool_size(void);
> >>
> >> #endif /* _ZBUD_H_ */
> >> diff --git a/mm/zbud.c b/mm/zbud.c
> >> index 9451361..711aaf4 100644
> >> --- a/mm/zbud.c
> >> +++ b/mm/zbud.c
> >> @@ -52,6 +52,13 @@
> >> #include <linux/spinlock.h>
> >> #include <linux/zbud.h>
> >>
> >> +/*********************************
> >> +* statistics
> >> +**********************************/
> >> +
> >> +/* zbud pages in all pools */
> >> +static u64 total_zbud_pages;
> >> +
> >> /*****************
> >> * Structures
> >> *****************/
> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
> >> return zhdr;
> >> }
> >>
> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
> >> +{
> >> + struct page *page;
> >> +
> >> + page = alloc_page(gfp);
> >> +
> >> + if (page) {
> >> + pool->pages_nr++;
> >> + total_zbud_pages++;
> >
> > Who protect race?
>
> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
> I will re-do it.
>
> I will change *total_zbud_pages* to an atomic type.

Wait, that doesn't make sense. You are assuming the zbud allocator will only
ever be used by zswap. That's true for now, but we can't guarantee it in future.
If another user starts to use the zbud allocator, total_zbud_pages becomes pointless.

Another concern: what is your scenario for the above with two swaps?
How often do we need to call zbud_get_pool_size?
In your previous patch you reduced the number of calls, so IIRC
we only called it in zswap_is_full and for debugfs.
Of course, it would need some lock or refcount to prevent destruction
of the zswap_tree in parallel with zswap_frontswap_invalidate_area, but
zswap_is_full doesn't need to be exact, so RCU would be a good fit.
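Something like this rough sketch (names illustrative; it assumes the
zswap_trees[] entries are published with rcu_assign_pointer at swapon and
only freed after synchronize_rcu in the invalidate path):

static u64 zswap_pool_pages_total(void)
{
        struct zswap_tree *tree;
        u64 pages = 0;
        int i;

        rcu_read_lock();
        for (i = 0; i < MAX_SWAPFILES; i++) {
                tree = rcu_dereference(zswap_trees[i]);
                if (tree)
                        pages += zbud_get_pool_size(tree->pool);
        }
        rcu_read_unlock();

        return pages;
}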

The most important point is that zswap doesn't currently consider multiple swaps.
For example, let's assume you use two swaps A and B with different priorities,
A was charged up to 19% a long time ago, and A is now full, so the VM starts
to use B, which has recently been charged 1%.
It means zswap has charged 19% + 1% = 20%, which is full by default.

Then, if the VM wants to swap out more pages into B, zbud_reclaim_page
would evict one of the pages in B's pool, and that would be repeated
continuously. It's a total LRU-inversion problem, and swap thrashing in B
would happen.

Please describe your use-case scenario; if it's a real problem,
we need more surgery.

Thanks.


--
Kind regards,
Minchan Kim

2014-01-21 06:35:14

by Cai Liu

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

2014/1/21 Minchan Kim <[email protected]>:
> Please check your MUA and don't break thread.
>
> On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
>> Thanks for your review.
>>
>> 2014/1/21 Minchan Kim <[email protected]>:
>> > Hello Cai,
>> >
>> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>> >> zswap can support multiple swapfiles. So we need to check
>> >> all zbud pool pages in zswap.
>> >>
>> >> Version 2:
>> >> * add *total_zbud_pages* in zbud to record all the pages in pools
>> >> * move the updating of pool pages statistics to
>> >> alloc_zbud_page/free_zbud_page to hide the details
>> >>
>> >> Signed-off-by: Cai Liu <[email protected]>
>> >> ---
>> >> include/linux/zbud.h | 2 +-
>> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>> >> mm/zswap.c | 4 ++--
>> >> 3 files changed, 35 insertions(+), 15 deletions(-)
>> >>
>> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>> >> index 2571a5c..1dbc13e 100644
>> >> --- a/include/linux/zbud.h
>> >> +++ b/include/linux/zbud.h
>> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>> >> +u64 zbud_get_pool_size(void);
>> >>
>> >> #endif /* _ZBUD_H_ */
>> >> diff --git a/mm/zbud.c b/mm/zbud.c
>> >> index 9451361..711aaf4 100644
>> >> --- a/mm/zbud.c
>> >> +++ b/mm/zbud.c
>> >> @@ -52,6 +52,13 @@
>> >> #include <linux/spinlock.h>
>> >> #include <linux/zbud.h>
>> >>
>> >> +/*********************************
>> >> +* statistics
>> >> +**********************************/
>> >> +
>> >> +/* zbud pages in all pools */
>> >> +static u64 total_zbud_pages;
>> >> +
>> >> /*****************
>> >> * Structures
>> >> *****************/
>> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>> >> return zhdr;
>> >> }
>> >>
>> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>> >> +{
>> >> + struct page *page;
>> >> +
>> >> + page = alloc_page(gfp);
>> >> +
>> >> + if (page) {
>> >> + pool->pages_nr++;
>> >> + total_zbud_pages++;
>> >
>> > Who protect race?
>>
>> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
>> I will re-do it.
>>
>> I will change *total_zbud_pages* to an atomic type.
>
> Wait, that doesn't make sense. You are assuming the zbud allocator will only
> ever be used by zswap. That's true for now, but we can't guarantee it in future.
> If another user starts to use the zbud allocator, total_zbud_pages becomes pointless.

Yes, you are right. zbud is a common module, so calculating the zswap pool
size inside zbud, as this patch does, is not suitable.

>
> Another concern: what is your scenario for the above with two swaps?
> How often do we need to call zbud_get_pool_size?
> In your previous patch you reduced the number of calls, so IIRC
> we only called it in zswap_is_full and for debugfs.

zbud_get_pool_size() is called frequently, whenever a zswap entry is added
or freed in zswap. That is why this patch adds a counter in zbud, so that
zswap no longer needs to iterate over the zswap trees to calculate the
pool size.

> Of course, it would need some lock or refcount to prevent destruction
> of the zswap_tree in parallel with zswap_frontswap_invalidate_area, but
> zswap_is_full doesn't need to be exact, so RCU would be a good fit.
>
> The most important point is that zswap doesn't currently consider multiple swaps.
> For example, let's assume you use two swaps A and B with different priorities,
> A was charged up to 19% a long time ago, and A is now full, so the VM starts
> to use B, which has recently been charged 1%.
> It means zswap has charged 19% + 1% = 20%, which is full by default.
>
> Then, if the VM wants to swap out more pages into B, zbud_reclaim_page
> would evict one of the pages in B's pool, and that would be repeated
> continuously. It's a total LRU-inversion problem, and swap thrashing in B
> would happen.
>

The scenario is this: there are two swaps, A and B, in the system. If the
pool size of A reaches 19% of RAM and swap A is also full, then swap B will
be used, and the pool size of B will grow until it alone hits 20% of RAM.
By that point the total zswap pool size is about 39% of RAM. If there are
more than two swap files/devices, the zswap pool can expand out of control
and there may be no real swapout at all.

I think the original intention of the zswap designers was to keep the
total size of all zswap pools below 20% of RAM.
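For reference, the limit check in zswap_is_full() is roughly this (from
memory, so treat it as a sketch):

static bool zswap_is_full(void)
{
        return totalram_pages * zswap_max_pool_percent / 100 <
                        zswap_pool_pages;
}

With multiple swaps, zswap_pool_pages has to cover all the pools, otherwise
each new pool effectively gets its own 20%.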

Thanks.


2014-01-21 08:17:16

by Minchan Kim

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello,

On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
> 2014/1/21 Minchan Kim <[email protected]>:
> > Please check your MUA and don't break thread.
> >
> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
> >> Thanks for your review.
> >>
> >> 2014/1/21 Minchan Kim <[email protected]>:
> >> > Hello Cai,
> >> >
> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
> >> >> zswap can support multiple swapfiles. So we need to check
> >> >> all zbud pool pages in zswap.
> >> >>
> >> >> Version 2:
> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
> >> >> * move the updating of pool pages statistics to
> >> >> alloc_zbud_page/free_zbud_page to hide the details
> >> >>
> >> >> Signed-off-by: Cai Liu <[email protected]>
> >> >> ---
> >> >> include/linux/zbud.h | 2 +-
> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
> >> >> mm/zswap.c | 4 ++--
> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
> >> >>
> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> >> >> index 2571a5c..1dbc13e 100644
> >> >> --- a/include/linux/zbud.h
> >> >> +++ b/include/linux/zbud.h
> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
> >> >> +u64 zbud_get_pool_size(void);
> >> >>
> >> >> #endif /* _ZBUD_H_ */
> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
> >> >> index 9451361..711aaf4 100644
> >> >> --- a/mm/zbud.c
> >> >> +++ b/mm/zbud.c
> >> >> @@ -52,6 +52,13 @@
> >> >> #include <linux/spinlock.h>
> >> >> #include <linux/zbud.h>
> >> >>
> >> >> +/*********************************
> >> >> +* statistics
> >> >> +**********************************/
> >> >> +
> >> >> +/* zbud pages in all pools */
> >> >> +static u64 total_zbud_pages;
> >> >> +
> >> >> /*****************
> >> >> * Structures
> >> >> *****************/
> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
> >> >> return zhdr;
> >> >> }
> >> >>
> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
> >> >> +{
> >> >> + struct page *page;
> >> >> +
> >> >> + page = alloc_page(gfp);
> >> >> +
> >> >> + if (page) {
> >> >> + pool->pages_nr++;
> >> >> + total_zbud_pages++;
> >> >
> >> > Who protect race?
> >>
> >> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
> >> I will re-do it.
> >>
> >> I will change *total_zbud_pages* to an atomic type.
> >
> > Wait, that doesn't make sense. You are assuming the zbud allocator will only
> > ever be used by zswap. That's true for now, but we can't guarantee it in future.
> > If another user starts to use the zbud allocator, total_zbud_pages becomes pointless.
>
> Yes, you are right. zbud is a common module, so calculating the zswap pool
> size inside zbud, as this patch does, is not suitable.
>
> >
> > Another concern: what is your scenario for the above with two swaps?
> > How often do we need to call zbud_get_pool_size?
> > In your previous patch you reduced the number of calls, so IIRC
> > we only called it in zswap_is_full and for debugfs.
>
> zbud_get_pool_size() is called frequently, whenever a zswap entry is added
> or freed in zswap. That is why this patch adds a counter in zbud, so that
> zswap no longer needs to iterate over the zswap trees to calculate the
> pool size.

We can remove the updating of zswap_pool_pages in zswap_frontswap_store and
zswap_free_entry, as I said, so zswap_is_full is the only hot spot.
Do you think that is still a big overhead? Why? Is it the locking needed to
prevent destruction? Then we can use RCU to minimize the overhead, as I mentioned.
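I.e. something like this sketch, reusing an on-demand helper such as the
illustrative zswap_pool_pages_total() from my earlier mail:

static bool zswap_is_full(void)
{
        return totalram_pages * zswap_max_pool_percent / 100 <
                        zswap_pool_pages_total();
}

Then the store/free paths don't have to maintain a cached total at all.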

>
> > Of course, it would need some lock or refcount to prevent destruction
> > of the zswap_tree in parallel with zswap_frontswap_invalidate_area, but
> > zswap_is_full doesn't need to be exact, so RCU would be a good fit.
> >
> > The most important point is that zswap doesn't currently consider multiple swaps.
> > For example, let's assume you use two swaps A and B with different priorities,
> > A was charged up to 19% a long time ago, and A is now full, so the VM starts
> > to use B, which has recently been charged 1%.
> > It means zswap has charged 19% + 1% = 20%, which is full by default.
> >
> > Then, if the VM wants to swap out more pages into B, zbud_reclaim_page
> > would evict one of the pages in B's pool, and that would be repeated
> > continuously. It's a total LRU-inversion problem, and swap thrashing in B
> > would happen.
> >
>
> The scenario is this: there are two swaps, A and B, in the system. If the
> pool size of A reaches 19% of RAM and swap A is also full, then swap B will
> be used, and the pool size of B will grow until it alone hits 20% of RAM.
> By that point the total zswap pool size is about 39% of RAM. If there are
> more than two swap files/devices, the zswap pool can expand out of control
> and there may be no real swapout at all.

I know.

>
> I think the original intention of the zswap designers was to keep the
> total size of all zswap pools below 20% of RAM.

My point is your patch still doesn't solve the example I mentioned.


--
Kind regards,
Minchan Kim

2014-01-21 13:52:30

by Cai Liu

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Minchan

2014/1/21 Minchan Kim <[email protected]>:
> Hello,
>
> On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
>> 2014/1/21 Minchan Kim <[email protected]>:
>> > Please check your MUA and don't break thread.
>> >
>> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
>> >> Thanks for your review.
>> >>
>> >> 2014/1/21 Minchan Kim <[email protected]>:
>> >> > Hello Cai,
>> >> >
>> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>> >> >> zswap can support multiple swapfiles. So we need to check
>> >> >> all zbud pool pages in zswap.
>> >> >>
>> >> >> Version 2:
>> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
>> >> >> * move the updating of pool pages statistics to
>> >> >> alloc_zbud_page/free_zbud_page to hide the details
>> >> >>
>> >> >> Signed-off-by: Cai Liu <[email protected]>
>> >> >> ---
>> >> >> include/linux/zbud.h | 2 +-
>> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>> >> >> mm/zswap.c | 4 ++--
>> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
>> >> >>
>> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>> >> >> index 2571a5c..1dbc13e 100644
>> >> >> --- a/include/linux/zbud.h
>> >> >> +++ b/include/linux/zbud.h
>> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>> >> >> +u64 zbud_get_pool_size(void);
>> >> >>
>> >> >> #endif /* _ZBUD_H_ */
>> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
>> >> >> index 9451361..711aaf4 100644
>> >> >> --- a/mm/zbud.c
>> >> >> +++ b/mm/zbud.c
>> >> >> @@ -52,6 +52,13 @@
>> >> >> #include <linux/spinlock.h>
>> >> >> #include <linux/zbud.h>
>> >> >>
>> >> >> +/*********************************
>> >> >> +* statistics
>> >> >> +**********************************/
>> >> >> +
>> >> >> +/* zbud pages in all pools */
>> >> >> +static u64 total_zbud_pages;
>> >> >> +
>> >> >> /*****************
>> >> >> * Structures
>> >> >> *****************/
>> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>> >> >> return zhdr;
>> >> >> }
>> >> >>
>> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>> >> >> +{
>> >> >> + struct page *page;
>> >> >> +
>> >> >> + page = alloc_page(gfp);
>> >> >> +
>> >> >> + if (page) {
>> >> >> + pool->pages_nr++;
>> >> >> + total_zbud_pages++;
>> >> >
>> >> > Who protect race?
>> >>
>> >> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
>> >> I will re-do it.
>> >>
>> >> I will change *total_zbud_pages* to an atomic type.
>> >
>> > Wait, that doesn't make sense. You are assuming the zbud allocator will only
>> > ever be used by zswap. That's true for now, but we can't guarantee it in future.
>> > If another user starts to use the zbud allocator, total_zbud_pages becomes pointless.
>>
>> Yes, you are right. zbud is a common module, so calculating the zswap pool
>> size inside zbud, as this patch does, is not suitable.
>>
>> >
>> > Another concern: what is your scenario for the above with two swaps?
>> > How often do we need to call zbud_get_pool_size?
>> > In your previous patch you reduced the number of calls, so IIRC
>> > we only called it in zswap_is_full and for debugfs.
>>
>> zbud_get_pool_size() is called frequently, whenever a zswap entry is added
>> or freed in zswap. That is why this patch adds a counter in zbud, so that
>> zswap no longer needs to iterate over the zswap trees to calculate the
>> pool size.
>
> We can remove the updating of zswap_pool_pages in zswap_frontswap_store and
> zswap_free_entry, as I said, so zswap_is_full is the only hot spot.
> Do you think that is still a big overhead? Why? Is it the locking needed to
> prevent destruction? Then we can use RCU to minimize the overhead, as I mentioned.

I get your point. Yes, in my previous patch zswap_is_full() was the only
path that called zbud_get_pool_size(), and your suggestion on patch v1 to
remove the unnecessary iteration will reduce the overhead further.

So calculating the total of all the pool sizes in zswap.c is better.

>>
>> > Of course, it would need some lock or refcount to prevent destruction
>> > of the zswap_tree in parallel with zswap_frontswap_invalidate_area, but
>> > zswap_is_full doesn't need to be exact, so RCU would be a good fit.
>> >
>> > The most important point is that zswap doesn't currently consider multiple swaps.
>> > For example, let's assume you use two swaps A and B with different priorities,
>> > A was charged up to 19% a long time ago, and A is now full, so the VM starts
>> > to use B, which has recently been charged 1%.
>> > It means zswap has charged 19% + 1% = 20%, which is full by default.
>> >
>> > Then, if the VM wants to swap out more pages into B, zbud_reclaim_page
>> > would evict one of the pages in B's pool, and that would be repeated
>> > continuously. It's a total LRU-inversion problem, and swap thrashing in B
>> > would happen.
>> >
>>
>> The scenario is this: there are two swaps, A and B, in the system. If the
>> pool size of A reaches 19% of RAM and swap A is also full, then swap B will
>> be used, and the pool size of B will grow until it alone hits 20% of RAM.
>> By that point the total zswap pool size is about 39% of RAM. If there are
>> more than two swap files/devices, the zswap pool can expand out of control
>> and there may be no real swapout at all.
>
> I know.
>
>>
>> I think the original intention of the zswap designers was to keep the
>> total size of all zswap pools below 20% of RAM.
>
> My point is your patch still doesn't solve the example I mentioned.

Hmm. My patch only makes sure that all the zswap pools together use at most
20% of RAM. Your example shows a further problem: zbud_reclaim_page would
not evict the oldest zbud page overall when multiple swaps are used.
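The root of that problem is that each zbud pool keeps its own LRU. From
memory, struct zbud_pool looks roughly like this:

struct zbud_pool {
        spinlock_t lock;
        struct list_head unbuddied[NCHUNKS];
        struct list_head buddied;
        struct list_head lru;   /* per-pool LRU, not global */
        u64 pages_nr;
        struct zbud_ops *ops;
};

so zbud_reclaim_page(pool, retries) can only pick a victim from the one
pool it is handed, never the globally oldest zbud page.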

Maybe the new problem can be resolved in another patch.

Thanks.


2014-01-22 08:01:30

by Minchan Kim

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Cai,

On Tue, Jan 21, 2014 at 09:52:25PM +0800, Cai Liu wrote:
> Hello Minchan
>
> 2014/1/21 Minchan Kim <[email protected]>:
> > Hello,
> >
> > On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
> >> 2014/1/21 Minchan Kim <[email protected]>:
> >> > Please check your MUA and don't break thread.
> >> >
> >> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
> >> >> Thanks for your review.
> >> >>
> >> >> 2014/1/21 Minchan Kim <[email protected]>:
> >> >> > Hello Cai,
> >> >> >
> >> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
> >> >> >> zswap can support multiple swapfiles. So we need to check
> >> >> >> all zbud pool pages in zswap.
> >> >> >>
> >> >> >> Version 2:
> >> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
> >> >> >> * move the updating of pool pages statistics to
> >> >> >> alloc_zbud_page/free_zbud_page to hide the details
> >> >> >>
> >> >> >> Signed-off-by: Cai Liu <[email protected]>
> >> >> >> ---
> >> >> >> include/linux/zbud.h | 2 +-
> >> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
> >> >> >> mm/zswap.c | 4 ++--
> >> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
> >> >> >>
> >> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> >> >> >> index 2571a5c..1dbc13e 100644
> >> >> >> --- a/include/linux/zbud.h
> >> >> >> +++ b/include/linux/zbud.h
> >> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
> >> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> >> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> >> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
> >> >> >> +u64 zbud_get_pool_size(void);
> >> >> >>
> >> >> >> #endif /* _ZBUD_H_ */
> >> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
> >> >> >> index 9451361..711aaf4 100644
> >> >> >> --- a/mm/zbud.c
> >> >> >> +++ b/mm/zbud.c
> >> >> >> @@ -52,6 +52,13 @@
> >> >> >> #include <linux/spinlock.h>
> >> >> >> #include <linux/zbud.h>
> >> >> >>
> >> >> >> +/*********************************
> >> >> >> +* statistics
> >> >> >> +**********************************/
> >> >> >> +
> >> >> >> +/* zbud pages in all pools */
> >> >> >> +static u64 total_zbud_pages;
> >> >> >> +
> >> >> >> /*****************
> >> >> >> * Structures
> >> >> >> *****************/
> >> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
> >> >> >> return zhdr;
> >> >> >> }
> >> >> >>
> >> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
> >> >> >> +{
> >> >> >> + struct page *page;
> >> >> >> +
> >> >> >> + page = alloc_page(gfp);
> >> >> >> +
> >> >> >> + if (page) {
> >> >> >> + pool->pages_nr++;
> >> >> >> + total_zbud_pages++;
> >> >> >
> >> >> > Who protect race?
> >> >>
> >> >> Yes, here pool->pages_nr and also total_zbud_pages are not protected.
> >> >> I will re-do it.
> >> >>
> >> >> I will change *total_zbud_pages* to an atomic type.
> >> >
> >> > Wait, that doesn't make sense. You are assuming the zbud allocator will only
> >> > ever be used by zswap. That's true for now, but we can't guarantee it in future.
> >> > If another user starts to use the zbud allocator, total_zbud_pages becomes pointless.
> >>
> >> Yes, you are right. zbud is a common module, so calculating the zswap pool
> >> size inside zbud, as this patch does, is not suitable.
> >>
> >> >
> >> > Another concern: what is your scenario for the above with two swaps?
> >> > How often do we need to call zbud_get_pool_size?
> >> > In your previous patch you reduced the number of calls, so IIRC
> >> > we only called it in zswap_is_full and for debugfs.
> >>
> >> zbud_get_pool_size() is called frequently, whenever a zswap entry is added
> >> or freed in zswap. That is why this patch adds a counter in zbud, so that
> >> zswap no longer needs to iterate over the zswap trees to calculate the
> >> pool size.
> >
> > We can remove the updating of zswap_pool_pages in zswap_frontswap_store and
> > zswap_free_entry, as I said, so zswap_is_full is the only hot spot.
> > Do you think that is still a big overhead? Why? Is it the locking needed to
> > prevent destruction? Then we can use RCU to minimize the overhead, as I mentioned.
>
> I get your point. Yes, in my previous patch zswap_is_full() was the only
> path that called zbud_get_pool_size(), and your suggestion on patch v1 to
> remove the unnecessary iteration will reduce the overhead further.
>
> So calculating the total of all the pool sizes in zswap.c is better.
>
> >>
> >> > Of course, it would need some lock or refcount to prevent destruction
> >> > of the zswap_tree in parallel with zswap_frontswap_invalidate_area, but
> >> > zswap_is_full doesn't need to be exact, so RCU would be a good fit.
> >> >
> >> > The most important point is that zswap doesn't currently consider multiple swaps.
> >> > For example, let's assume you use two swaps A and B with different priorities,
> >> > A was charged up to 19% a long time ago, and A is now full, so the VM starts
> >> > to use B, which has recently been charged 1%.
> >> > It means zswap has charged 19% + 1% = 20%, which is full by default.
> >> >
> >> > Then, if the VM wants to swap out more pages into B, zbud_reclaim_page
> >> > would evict one of the pages in B's pool, and that would be repeated
> >> > continuously. It's a total LRU-inversion problem, and swap thrashing in B
> >> > would happen.
> >> >
> >>
> >> The scenario is this: there are two swaps, A and B, in the system. If the
> >> pool size of A reaches 19% of RAM and swap A is also full, then swap B will
> >> be used, and the pool size of B will grow until it alone hits 20% of RAM.
> >> By that point the total zswap pool size is about 39% of RAM. If there are
> >> more than two swap files/devices, the zswap pool can expand out of control
> >> and there may be no real swapout at all.
> >
> > I know.
> >
> >>
> >> I think the original intention of the zswap designers was to keep the
> >> total size of all zswap pools below 20% of RAM.
> >
> > My point is your patch still doesn't solve the example I mentioned.
>
> Hmm. My patch only makes sure that all the zswap pools together use at most
> 20% of RAM. Your example shows a further problem: zbud_reclaim_page would
> not evict the oldest zbud page overall when multiple swaps are used.
>
> Maybe the new problem can be resolved in another patch.

It means current zswap has a problem with multiple swaps, yet you want to
fix an issue that only happens when it is used with multiple swaps. So I'm
not sure we want a fix at this stage, before discussing the more
fundamental thing.

That's why I want to know why you want to use multiple swaps with zswap,
but you still haven't told us. :(


--
Kind regards,
Minchan Kim

2014-01-22 12:16:31

by Cai Liu

Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Minchan


2014/1/22 Minchan Kim <[email protected]>
>
> Hello Cai,
>
> On Tue, Jan 21, 2014 at 09:52:25PM +0800, Cai Liu wrote:
> > Hello Minchan
> >
> > 2014/1/21 Minchan Kim <[email protected]>:
> > > Hello,
> > >
> > > On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
> > >> 2014/1/21 Minchan Kim <[email protected]>:
> > >> > Please check your MUA and don't break thread.
> > >> >
> > >> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
> > >> >> Thanks for your review.
> > >> >>
> > >> >> 2014/1/21 Minchan Kim <[email protected]>:
> > >> >> > Hello Cai,
> > >> >> >
> > >> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
> > >> >> >> zswap can support multiple swapfiles. So we need to check
> > >> >> >> all zbud pool pages in zswap.
> > >> >> >>
> > >> >> >> Version 2:
> > >> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
> > >> >> >> * move the updating of pool pages statistics to
> > >> >> >> alloc_zbud_page/free_zbud_page to hide the details
> > >> >> >>
> > >> >> >> Signed-off-by: Cai Liu <[email protected]>
> > >> >> >> ---
> > >> >> >> include/linux/zbud.h | 2 +-
> > >> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
> > >> >> >> mm/zswap.c | 4 ++--
> > >> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
> > >> >> >>
> > >> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> > >> >> >> index 2571a5c..1dbc13e 100644
> > >> >> >> --- a/include/linux/zbud.h
> > >> >> >> +++ b/include/linux/zbud.h
> > >> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
> > >> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> > >> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> > >> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
> > >> >> >> +u64 zbud_get_pool_size(void);
> > >> >> >>
> > >> >> >> #endif /* _ZBUD_H_ */
> > >> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
> > >> >> >> index 9451361..711aaf4 100644
> > >> >> >> --- a/mm/zbud.c
> > >> >> >> +++ b/mm/zbud.c
> > >> >> >> @@ -52,6 +52,13 @@
> > >> >> >> #include <linux/spinlock.h>
> > >> >> >> #include <linux/zbud.h>
> > >> >> >>
> > >> >> >> +/*********************************
> > >> >> >> +* statistics
> > >> >> >> +**********************************/
> > >> >> >> +
> > >> >> >> +/* zbud pages in all pools */
> > >> >> >> +static u64 total_zbud_pages;
> > >> >> >> +
> > >> >> >> /*****************
> > >> >> >> * Structures
> > >> >> >> *****************/
> > >> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
> > >> >> >> return zhdr;
> > >> >> >> }
> > >> >> >>
> > >> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
> > >> >> >> +{
> > >> >> >> + struct page *page;
> > >> >> >> +
> > >> >> >> + page = alloc_page(gfp);
> > >> >> >> +
> > >> >> >> + if (page) {
> > >> >> >> + pool->pages_nr++;
> > >> >> >> + total_zbud_pages++;
> > >> >> >
> > >> >> > Who protect race?
> > >> >>
> > >> >> Yes, here the pool->pages_nr and also the total_zbud_pages are not protected.
> > >> >> I will re-do it.
> > >> >>
> > >> >> I will change *total_zbud_pages* to atomic type.
> > >> >
> > >> > Wait, it doesn't make sense. Now, you assume zbud allocator would be used
> > >> > for only zswap. It's true until now but we couldn't make sure it in future.
> > >> > If other user start to use zbud allocator, total_zbud_pages would be pointless.
> > >>
> > >> Yes, you are right. ZBUD is a common module. So in this patch calculate the
> > >> zswap pool size in zbud is not suitable.
> > >>
> > >> >
> > >> > Another concern is that what's your scenario for above two swap?
> > >> > How often we need to call zbud_get_pool_size?
> > >> > In previous your patch, you reduced the number of call so IIRC,
> > >> > we only called it in zswap_is_full and for debugfs.
> > >>
> > >> zbud_get_pool_size() is called frequently when adding/freeing zswap
> > >> entry happen in zswap . This is why in this patch I added a counter in zbud,
> > >> and then in zswap the iteration of zswap_list to calculate the pool size will
> > >> not be needed.
> > >
> > > We can remove updating zswap_pool_pages in zswap_frontswap_store and
> > > zswap_free_entry as I said. So zswap_is_full is only hot spot.
> > > Do you think it's still big overhead? Why? Maybe locking to prevent
> > > destroying? Then, we can use RCU to minimize the overhead as I mentioned.
> >
> > I get your point. Yes, In my previous patch, zswap_is_full() was the
> > only path to call
> > zbud_get_pool_size(). And your suggestion on patch v1 to remove the unnecessary
> > iteration will reduce the overhead further.
> >
> > So adding the calculating of all the pool size in zswap.c is better.
> >
> > >>
> > >> > Of course, it would need some lock or refcount to prevent destroy
> > >> > of zswap_tree in parallel with zswap_frontswap_invalidate_area but
> > >> > zswap_is_full doesn't need to be exact so RCU would be good fit.
> > >> >
> > >> > Most important point is that now zswap doesn't consider multiple swap.
> > >> > For example, Let's assume you uses two swap A and B with different priority
> > >> > and A already has charged 19% long time ago and let's assume that A swap is
> > >> > full now so VM start to use B so that B has charged 1% recently.
> > >> > It menas zswap charged (19% + 1%)i is full by default.
> > >> >
> > >> > Then, if VM want to swap out more pages into B, zbud_reclaim_page
> > >> > would be evict one of pages in B's pool and it would be repeated
> > >> > continuously. It's totally LRU reverse problem and swap thrashing in B
> > >> > would happen.
> > >> >
> > >>
> > >> The scenario is below:
> > >> There are 2 swap A, B in system. If pool size of A reach 19% of ram
> > >> size and swap A
> > >> is also full. Then swap B will be used. Pool size of B will be
> > >> increased until it hit
> > >> the 20% of the ram size. By now zswap pool size is about 39% of ram size.
> > >> If there are more than 2 swap file/device, zswap pool will expand out
> > >> of control
> > >> and there may be no swapout happened.
> > >
> > > I know.
> > >
> > >>
> > >> I think the original intention of zswap designer is to keep the total
> > >> zswap pools size below
> > >> 20% of RAM size.
> > >
> > > My point is your patch still doesn't solve the example I mentioned.
> >
> > Hmm. My patch only make sure all the zswap pools use maximum 20% of
> > RAM size. It is a new problem in your example. The zbud_reclaim_page would
> > not swap out the oldest zbud page when multiple swaps are used.
> >
> > Maybe the new problem can be resolved in another patch.
>
> It means current zswap has a problem in multiple swap but you want
> to fix a problem which happens only when it is used for multiple swap.
> So, I'm not sure we want a fix in this phase before discussing more
> fundamental thing.
>

Yes. The bug I want to fix only happens when multiple swaps are used.

> That's why I want to know why you want to use multiple swap with zswap
> but you are never saying it to us. :(
>

If the user uses more than one swap device/file, then this is an issue.
A zswap pool is created whenever a swap device/file is swapped on.
So there will be more than one zswap pool when the user enables two or
more swap devices/files.

I am not sure whether multiple swaps are popular. But if multiple swap
devices are swapped on, then multiple zswap pools will be created, and
the total size of these pools may grow out of control.
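
To make this concrete, here is a rough sketch of why the limit check
only ever sees one pool (my paraphrase, not a verbatim quote of the
kernel code):

    /* the global limit check in mm/zswap.c */
    static bool zswap_is_full(void)
    {
        return totalram_pages * zswap_max_pool_percent / 100 <
                zswap_pool_pages;
    }

    /*
     * ...but zswap_pool_pages is refreshed from whichever tree was
     * touched last, e.g. in zswap_frontswap_store():
     */
    zswap_pool_pages = zbud_get_pool_size(tree->pool);

So each pool's growth is compared against the single 20% budget while
the counter reflects only one pool, and together N pools can grow
toward N * 20% of RAM.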

Thanks.

> >
> > Thanks.
> >
> > >
> > >>
> > >> Thanks.
> > >>
> > >> > Please say your usecase scenario and if it's really problem,
> > >> > we need more surgery.
> > >> >
> > >> > Thanks.
> > >> >
> > >> >> For *pool->pages_nr*, one way is to use pool->lock to protect. But I
> > >> >> think it is too heavy.
> > >> >> So does it ok to change pages_nr to atomic type too?
> > >> >>
> > >> >>
> > >> >> >
> > >> >> >> + }
> > >> >> >> +
> > >> >> >> + return page;
> > >> >> >> +}
> > >> >> >> +
> > >> >> >> +
> > >> >> >> /* Resets the struct page fields and frees the page */
> > >> >> >> -static void free_zbud_page(struct zbud_header *zhdr)
> > >> >> >> +static void free_zbud_page(struct zbud_pool *pool, struct zbud_header *zhdr)
> > >> >> >> {
> > >> >> >> __free_page(virt_to_page(zhdr));
> > >> >> >> +
> > >> >> >> + pool->pages_nr--;
> > >> >> >> + total_zbud_pages--;
> > >> >> >> }
> > >> >> >>
> > >> >> >> /*
> > >> >> >> @@ -279,11 +304,10 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> > >> >> >>
> > >> >> >> /* Couldn't find unbuddied zbud page, create new one */
> > >> >> >> spin_unlock(&pool->lock);
> > >> >> >> - page = alloc_page(gfp);
> > >> >> >> + page = alloc_zbud_page(pool, gfp);
> > >> >> >> if (!page)
> > >> >> >> return -ENOMEM;
> > >> >> >> spin_lock(&pool->lock);
> > >> >> >> - pool->pages_nr++;
> > >> >> >> zhdr = init_zbud_page(page);
> > >> >> >> bud = FIRST;
> > >> >> >>
> > >> >> >> @@ -349,8 +373,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
> > >> >> >> if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
> > >> >> >> /* zbud page is empty, free */
> > >> >> >> list_del(&zhdr->lru);
> > >> >> >> - free_zbud_page(zhdr);
> > >> >> >> - pool->pages_nr--;
> > >> >> >> + free_zbud_page(pool, zhdr);
> > >> >> >> } else {
> > >> >> >> /* Add to unbuddied list */
> > >> >> >> freechunks = num_free_chunks(zhdr);
> > >> >> >> @@ -447,8 +470,7 @@ next:
> > >> >> >> * Both buddies are now free, free the zbud page and
> > >> >> >> * return success.
> > >> >> >> */
> > >> >> >> - free_zbud_page(zhdr);
> > >> >> >> - pool->pages_nr--;
> > >> >> >> + free_zbud_page(pool, zhdr);
> > >> >> >> spin_unlock(&pool->lock);
> > >> >> >> return 0;
> > >> >> >> } else if (zhdr->first_chunks == 0 ||
> > >> >> >> @@ -496,14 +518,12 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
> > >> >> >>
> > >> >> >> /**
> > >> >> >> * zbud_get_pool_size() - gets the zbud pool size in pages
> > >> >> >> - * @pool: pool whose size is being queried
> > >> >> >> *
> > >> >> >> - * Returns: size in pages of the given pool. The pool lock need not be
> > >> >> >> - * taken to access pages_nr.
> > >> >> >> + * Returns: size in pages of all the zbud pools.
> > >> >> >> */
> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool)
> > >> >> >> +u64 zbud_get_pool_size(void)
> > >> >> >> {
> > >> >> >> - return pool->pages_nr;
> > >> >> >> + return total_zbud_pages;
> > >> >> >> }
> > >> >> >>
> > >> >> >> static int __init init_zbud(void)
> > >> >> >> diff --git a/mm/zswap.c b/mm/zswap.c
> > >> >> >> index 5a63f78..ef44d9d 100644
> > >> >> >> --- a/mm/zswap.c
> > >> >> >> +++ b/mm/zswap.c
> > >> >> >> @@ -291,7 +291,7 @@ static void zswap_free_entry(struct zswap_tree *tree,
> > >> >> >> zbud_free(tree->pool, entry->handle);
> > >> >> >> zswap_entry_cache_free(entry);
> > >> >> >> atomic_dec(&zswap_stored_pages);
> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
> > >> >> >> }
> > >> >> >>
> > >> >> >> /* caller must hold the tree lock */
> > >> >> >> @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> > >> >> >>
> > >> >> >> /* update stats */
> > >> >> >> atomic_inc(&zswap_stored_pages);
> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
> > >> >> >>
> > >> >> >> return 0;
> > >> >> >>
> > >> >> >> --
> > >> >> >> 1.7.10.4
> > >> >> >>
> > >> >> >
> > >> >> > --
> > >> >> > Kind regards,
> > >> >> > Minchan Kim
> > >> >>
> > >> >
> > >> > --
> > >> > Kind regards,
> > >> > Minchan Kim
> > >>
> > >
> > > --
> > > Kind regards,
> > > Minchan Kim
> >
>
> --
> Kind regards,
> Minchan Kim

2014-01-22 14:15:03

by Dan Streetman

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

On Wed, Jan 22, 2014 at 7:16 AM, Cai Liu <[email protected]> wrote:
> Hello Minchan
>
>
> 2014/1/22 Minchan Kim <[email protected]>
>>
>> Hello Cai,
>>
>> On Tue, Jan 21, 2014 at 09:52:25PM +0800, Cai Liu wrote:
>> > Hello Minchan
>> >
>> > 2014/1/21 Minchan Kim <[email protected]>:
>> > > Hello,
>> > >
>> > > On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
>> > >> 2014/1/21 Minchan Kim <[email protected]>:
>> > >> > Please check your MUA and don't break thread.
>> > >> >
>> > >> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
>> > >> >> Thanks for your review.
>> > >> >>
>> > >> >> 2014/1/21 Minchan Kim <[email protected]>:
>> > >> >> > Hello Cai,
>> > >> >> >
>> > >> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>> > >> >> >> zswap can support multiple swapfiles. So we need to check
>> > >> >> >> all zbud pool pages in zswap.
>> > >> >> >>
>> > >> >> >> Version 2:
>> > >> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
>> > >> >> >> * move the updating of pool pages statistics to
>> > >> >> >> alloc_zbud_page/free_zbud_page to hide the details
>> > >> >> >>
>> > >> >> >> Signed-off-by: Cai Liu <[email protected]>
>> > >> >> >> ---
>> > >> >> >> include/linux/zbud.h | 2 +-
>> > >> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>> > >> >> >> mm/zswap.c | 4 ++--
>> > >> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
>> > >> >> >>
>> > >> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>> > >> >> >> index 2571a5c..1dbc13e 100644
>> > >> >> >> --- a/include/linux/zbud.h
>> > >> >> >> +++ b/include/linux/zbud.h
>> > >> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>> > >> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>> > >> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>> > >> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>> > >> >> >> +u64 zbud_get_pool_size(void);
>> > >> >> >>
>> > >> >> >> #endif /* _ZBUD_H_ */
>> > >> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
>> > >> >> >> index 9451361..711aaf4 100644
>> > >> >> >> --- a/mm/zbud.c
>> > >> >> >> +++ b/mm/zbud.c
>> > >> >> >> @@ -52,6 +52,13 @@
>> > >> >> >> #include <linux/spinlock.h>
>> > >> >> >> #include <linux/zbud.h>
>> > >> >> >>
>> > >> >> >> +/*********************************
>> > >> >> >> +* statistics
>> > >> >> >> +**********************************/
>> > >> >> >> +
>> > >> >> >> +/* zbud pages in all pools */
>> > >> >> >> +static u64 total_zbud_pages;
>> > >> >> >> +
>> > >> >> >> /*****************
>> > >> >> >> * Structures
>> > >> >> >> *****************/
>> > >> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>> > >> >> >> return zhdr;
>> > >> >> >> }
>> > >> >> >>
>> > >> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>> > >> >> >> +{
>> > >> >> >> + struct page *page;
>> > >> >> >> +
>> > >> >> >> + page = alloc_page(gfp);
>> > >> >> >> +
>> > >> >> >> + if (page) {
>> > >> >> >> + pool->pages_nr++;
>> > >> >> >> + total_zbud_pages++;
>> > >> >> >
>> > >> >> > Who protect race?
>> > >> >>
>> > >> >> Yes, here the pool->pages_nr and also the total_zbud_pages are not protected.
>> > >> >> I will re-do it.
>> > >> >>
>> > >> >> I will change *total_zbud_pages* to atomic type.
>> > >> >
>> > >> > Wait, it doesn't make sense. Now, you assume zbud allocator would be used
>> > >> > for only zswap. It's true until now but we couldn't make sure it in future.
>> > >> > If other user start to use zbud allocator, total_zbud_pages would be pointless.
>> > >>
>> > >> Yes, you are right. ZBUD is a common module. So in this patch calculate the
>> > >> zswap pool size in zbud is not suitable.
>> > >>
>> > >> >
>> > >> > Another concern is that what's your scenario for above two swap?
>> > >> > How often we need to call zbud_get_pool_size?
>> > >> > In previous your patch, you reduced the number of call so IIRC,
>> > >> > we only called it in zswap_is_full and for debugfs.
>> > >>
>> > >> zbud_get_pool_size() is called frequently when adding/freeing zswap
>> > >> entry happen in zswap . This is why in this patch I added a counter in zbud,
>> > >> and then in zswap the iteration of zswap_list to calculate the pool size will
>> > >> not be needed.
>> > >
>> > > We can remove updating zswap_pool_pages in zswap_frontswap_store and
>> > > zswap_free_entry as I said. So zswap_is_full is only hot spot.
>> > > Do you think it's still big overhead? Why? Maybe locking to prevent
>> > > destroying? Then, we can use RCU to minimize the overhead as I mentioned.
>> >
>> > I get your point. Yes, In my previous patch, zswap_is_full() was the
>> > only path to call
>> > zbud_get_pool_size(). And your suggestion on patch v1 to remove the unnecessary
>> > iteration will reduce the overhead further.
>> >
>> > So adding the calculating of all the pool size in zswap.c is better.
>> >
>> > >>
>> > >> > Of course, it would need some lock or refcount to prevent destroy
>> > >> > of zswap_tree in parallel with zswap_frontswap_invalidate_area but
>> > >> > zswap_is_full doesn't need to be exact so RCU would be good fit.
>> > >> >
>> > >> > Most important point is that now zswap doesn't consider multiple swap.
>> > >> > For example, Let's assume you uses two swap A and B with different priority
>> > >> > and A already has charged 19% long time ago and let's assume that A swap is
>> > >> > full now so VM start to use B so that B has charged 1% recently.
>> > >> > It menas zswap charged (19% + 1%)i is full by default.
>> > >> >
>> > >> > Then, if VM want to swap out more pages into B, zbud_reclaim_page
>> > >> > would be evict one of pages in B's pool and it would be repeated
>> > >> > continuously. It's totally LRU reverse problem and swap thrashing in B
>> > >> > would happen.
>> > >> >
>> > >>
>> > >> The scenario is below:
>> > >> There are 2 swap A, B in system. If pool size of A reach 19% of ram
>> > >> size and swap A
>> > >> is also full. Then swap B will be used. Pool size of B will be
>> > >> increased until it hit
>> > >> the 20% of the ram size. By now zswap pool size is about 39% of ram size.
>> > >> If there are more than 2 swap file/device, zswap pool will expand out
>> > >> of control
>> > >> and there may be no swapout happened.
>> > >
>> > > I know.
>> > >
>> > >>
>> > >> I think the original intention of zswap designer is to keep the total
>> > >> zswap pools size below
>> > >> 20% of RAM size.
>> > >
>> > > My point is your patch still doesn't solve the example I mentioned.
>> >
>> > Hmm. My patch only make sure all the zswap pools use maximum 20% of
>> > RAM size. It is a new problem in your example. The zbud_reclaim_page would
>> > not swap out the oldest zbud page when multiple swaps are used.
>> >
>> > Maybe the new problem can be resolved in another patch.
>>
>> It means current zswap has a problem in multiple swap but you want
>> to fix a problem which happens only when it is used for multiple swap.
>> So, I'm not sure we want a fix in this phase before discussing more
>> fundamental thing.
>>
>
> Yes, The bug which I want to fix only happens when multiple swap are used.
>
>> That's why I want to know why you want to use multiple swap with zswap
>> but you are never saying it to us. :(
>>
>
> If user uses more than one swap device/file, then this is an issue.
> Zswap pool is created when a swap device/file is swapped on happens.
> So there will be more than one zswap pool when user uses 2 or even
> more swap devices/files.
>
> I am not sure whether multiple swap are popular. But if multiple swap
> are swapped
> on, then multiple zswap pool will be created. And the size of these pools may
> out of control.

Personally, I don't think using multiple swap partitions/files has to
be popular for this to need solving; it only needs to be possible,
which it is.

Why not just leave zbud unchanged, and sum up the total size over the
list of active zswap_trees, as Minchan suggested for the v1 patch? The
debugfs_create_u64("pool_pages") entry will probably need to be changed
to debugfs_create_file() with a read function that sums up the total,
something like the sketch below.
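
For illustration only, a minimal sketch of that direction, assuming a
hypothetical helper name zswap_total_pool_pages() and ignoring the
locking (or RCU) needed to keep zswap_frontswap_invalidate_area() from
destroying a tree mid-walk:

    static u64 zswap_total_pool_pages(void)
    {
        u64 pages = 0;
        int i;

        /* walk every possible swap type; NULL means no tree there */
        for (i = 0; i < MAX_SWAPFILES; i++) {
            struct zswap_tree *tree = zswap_trees[i];

            if (tree)
                pages += zbud_get_pool_size(tree->pool);
        }
        return pages;
    }

    static int zswap_pool_pages_get(void *data, u64 *val)
    {
        *val = zswap_total_pool_pages();
        return 0;
    }
    DEFINE_SIMPLE_ATTRIBUTE(zswap_pool_pages_fops,
                            zswap_pool_pages_get, NULL, "%llu\n");

    /* in zswap_debugfs_init(), replacing debugfs_create_u64(): */
    debugfs_create_file("pool_pages", S_IRUGO, zswap_debugfs_root,
                        NULL, &zswap_pool_pages_fops);

That keeps zbud a dumb allocator and turns the total into an on-demand
computation instead of a cached global.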


>
> Thanks.
>
>> >
>> > Thanks.
>> >
>> > >
>> > >>
>> > >> Thanks.
>> > >>
>> > >> > Please say your usecase scenario and if it's really problem,
>> > >> > we need more surgery.
>> > >> >
>> > >> > Thanks.
>> > >> >
>> > >> >> For *pool->pages_nr*, one way is to use pool->lock to protect. But I
>> > >> >> think it is too heavy.
>> > >> >> So does it ok to change pages_nr to atomic type too?
>> > >> >>
>> > >> >>
>> > >> >> >
>> > >> >> >> + }
>> > >> >> >> +
>> > >> >> >> + return page;
>> > >> >> >> +}
>> > >> >> >> +
>> > >> >> >> +
>> > >> >> >> /* Resets the struct page fields and frees the page */
>> > >> >> >> -static void free_zbud_page(struct zbud_header *zhdr)
>> > >> >> >> +static void free_zbud_page(struct zbud_pool *pool, struct zbud_header *zhdr)
>> > >> >> >> {
>> > >> >> >> __free_page(virt_to_page(zhdr));
>> > >> >> >> +
>> > >> >> >> + pool->pages_nr--;
>> > >> >> >> + total_zbud_pages--;
>> > >> >> >> }
>> > >> >> >>
>> > >> >> >> /*
>> > >> >> >> @@ -279,11 +304,10 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
>> > >> >> >>
>> > >> >> >> /* Couldn't find unbuddied zbud page, create new one */
>> > >> >> >> spin_unlock(&pool->lock);
>> > >> >> >> - page = alloc_page(gfp);
>> > >> >> >> + page = alloc_zbud_page(pool, gfp);
>> > >> >> >> if (!page)
>> > >> >> >> return -ENOMEM;
>> > >> >> >> spin_lock(&pool->lock);
>> > >> >> >> - pool->pages_nr++;
>> > >> >> >> zhdr = init_zbud_page(page);
>> > >> >> >> bud = FIRST;
>> > >> >> >>
>> > >> >> >> @@ -349,8 +373,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
>> > >> >> >> if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
>> > >> >> >> /* zbud page is empty, free */
>> > >> >> >> list_del(&zhdr->lru);
>> > >> >> >> - free_zbud_page(zhdr);
>> > >> >> >> - pool->pages_nr--;
>> > >> >> >> + free_zbud_page(pool, zhdr);
>> > >> >> >> } else {
>> > >> >> >> /* Add to unbuddied list */
>> > >> >> >> freechunks = num_free_chunks(zhdr);
>> > >> >> >> @@ -447,8 +470,7 @@ next:
>> > >> >> >> * Both buddies are now free, free the zbud page and
>> > >> >> >> * return success.
>> > >> >> >> */
>> > >> >> >> - free_zbud_page(zhdr);
>> > >> >> >> - pool->pages_nr--;
>> > >> >> >> + free_zbud_page(pool, zhdr);
>> > >> >> >> spin_unlock(&pool->lock);
>> > >> >> >> return 0;
>> > >> >> >> } else if (zhdr->first_chunks == 0 ||
>> > >> >> >> @@ -496,14 +518,12 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
>> > >> >> >>
>> > >> >> >> /**
>> > >> >> >> * zbud_get_pool_size() - gets the zbud pool size in pages
>> > >> >> >> - * @pool: pool whose size is being queried
>> > >> >> >> *
>> > >> >> >> - * Returns: size in pages of the given pool. The pool lock need not be
>> > >> >> >> - * taken to access pages_nr.
>> > >> >> >> + * Returns: size in pages of all the zbud pools.
>> > >> >> >> */
>> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool)
>> > >> >> >> +u64 zbud_get_pool_size(void)
>> > >> >> >> {
>> > >> >> >> - return pool->pages_nr;
>> > >> >> >> + return total_zbud_pages;
>> > >> >> >> }
>> > >> >> >>
>> > >> >> >> static int __init init_zbud(void)
>> > >> >> >> diff --git a/mm/zswap.c b/mm/zswap.c
>> > >> >> >> index 5a63f78..ef44d9d 100644
>> > >> >> >> --- a/mm/zswap.c
>> > >> >> >> +++ b/mm/zswap.c
>> > >> >> >> @@ -291,7 +291,7 @@ static void zswap_free_entry(struct zswap_tree *tree,
>> > >> >> >> zbud_free(tree->pool, entry->handle);
>> > >> >> >> zswap_entry_cache_free(entry);
>> > >> >> >> atomic_dec(&zswap_stored_pages);
>> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
>> > >> >> >> }
>> > >> >> >>
>> > >> >> >> /* caller must hold the tree lock */
>> > >> >> >> @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>> > >> >> >>
>> > >> >> >> /* update stats */
>> > >> >> >> atomic_inc(&zswap_stored_pages);
>> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
>> > >> >> >>
>> > >> >> >> return 0;
>> > >> >> >>
>> > >> >> >> --
>> > >> >> >> 1.7.10.4
>> > >> >> >>
>> > >> >> >
>> > >> >> > --
>> > >> >> > Kind regards,
>> > >> >> > Minchan Kim
>> > >> >>
>> > >> >
>> > >> > --
>> > >> > Kind regards,
>> > >> > Minchan Kim
>> > >>
>> > >
>> > > --
>> > > Kind regards,
>> > > Minchan Kim
>> >
>>
>> --
>> Kind regards,
>> Minchan Kim
>

2014-01-23 01:38:43

by Cai Liu

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Dan

2014/1/22 Dan Streetman <[email protected]>:
> On Wed, Jan 22, 2014 at 7:16 AM, Cai Liu <[email protected]> wrote:
>> Hello Minchan
>>
>>
>> 2014/1/22 Minchan Kim <[email protected]>
>>>
>>> Hello Cai,
>>>
>>> On Tue, Jan 21, 2014 at 09:52:25PM +0800, Cai Liu wrote:
>>> > Hello Minchan
>>> >
>>> > 2014/1/21 Minchan Kim <[email protected]>:
>>> > > Hello,
>>> > >
>>> > > On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
>>> > >> 2014/1/21 Minchan Kim <[email protected]>:
>>> > >> > Please check your MUA and don't break thread.
>>> > >> >
>>> > >> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
>>> > >> >> Thanks for your review.
>>> > >> >>
>>> > >> >> 2014/1/21 Minchan Kim <[email protected]>:
>>> > >> >> > Hello Cai,
>>> > >> >> >
>>> > >> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
>>> > >> >> >> zswap can support multiple swapfiles. So we need to check
>>> > >> >> >> all zbud pool pages in zswap.
>>> > >> >> >>
>>> > >> >> >> Version 2:
>>> > >> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
>>> > >> >> >> * move the updating of pool pages statistics to
>>> > >> >> >> alloc_zbud_page/free_zbud_page to hide the details
>>> > >> >> >>
>>> > >> >> >> Signed-off-by: Cai Liu <[email protected]>
>>> > >> >> >> ---
>>> > >> >> >> include/linux/zbud.h | 2 +-
>>> > >> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
>>> > >> >> >> mm/zswap.c | 4 ++--
>>> > >> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
>>> > >> >> >>
>>> > >> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
>>> > >> >> >> index 2571a5c..1dbc13e 100644
>>> > >> >> >> --- a/include/linux/zbud.h
>>> > >> >> >> +++ b/include/linux/zbud.h
>>> > >> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
>>> > >> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
>>> > >> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
>>> > >> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
>>> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
>>> > >> >> >> +u64 zbud_get_pool_size(void);
>>> > >> >> >>
>>> > >> >> >> #endif /* _ZBUD_H_ */
>>> > >> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
>>> > >> >> >> index 9451361..711aaf4 100644
>>> > >> >> >> --- a/mm/zbud.c
>>> > >> >> >> +++ b/mm/zbud.c
>>> > >> >> >> @@ -52,6 +52,13 @@
>>> > >> >> >> #include <linux/spinlock.h>
>>> > >> >> >> #include <linux/zbud.h>
>>> > >> >> >>
>>> > >> >> >> +/*********************************
>>> > >> >> >> +* statistics
>>> > >> >> >> +**********************************/
>>> > >> >> >> +
>>> > >> >> >> +/* zbud pages in all pools */
>>> > >> >> >> +static u64 total_zbud_pages;
>>> > >> >> >> +
>>> > >> >> >> /*****************
>>> > >> >> >> * Structures
>>> > >> >> >> *****************/
>>> > >> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
>>> > >> >> >> return zhdr;
>>> > >> >> >> }
>>> > >> >> >>
>>> > >> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
>>> > >> >> >> +{
>>> > >> >> >> + struct page *page;
>>> > >> >> >> +
>>> > >> >> >> + page = alloc_page(gfp);
>>> > >> >> >> +
>>> > >> >> >> + if (page) {
>>> > >> >> >> + pool->pages_nr++;
>>> > >> >> >> + total_zbud_pages++;
>>> > >> >> >
>>> > >> >> > Who protect race?
>>> > >> >>
>>> > >> >> Yes, here the pool->pages_nr and also the total_zbud_pages are not protected.
>>> > >> >> I will re-do it.
>>> > >> >>
>>> > >> >> I will change *total_zbud_pages* to atomic type.
>>> > >> >
>>> > >> > Wait, it doesn't make sense. Now, you assume zbud allocator would be used
>>> > >> > for only zswap. It's true until now but we couldn't make sure it in future.
>>> > >> > If other user start to use zbud allocator, total_zbud_pages would be pointless.
>>> > >>
>>> > >> Yes, you are right. ZBUD is a common module. So in this patch calculate the
>>> > >> zswap pool size in zbud is not suitable.
>>> > >>
>>> > >> >
>>> > >> > Another concern is that what's your scenario for above two swap?
>>> > >> > How often we need to call zbud_get_pool_size?
>>> > >> > In previous your patch, you reduced the number of call so IIRC,
>>> > >> > we only called it in zswap_is_full and for debugfs.
>>> > >>
>>> > >> zbud_get_pool_size() is called frequently when adding/freeing zswap
>>> > >> entry happen in zswap . This is why in this patch I added a counter in zbud,
>>> > >> and then in zswap the iteration of zswap_list to calculate the pool size will
>>> > >> not be needed.
>>> > >
>>> > > We can remove updating zswap_pool_pages in zswap_frontswap_store and
>>> > > zswap_free_entry as I said. So zswap_is_full is only hot spot.
>>> > > Do you think it's still big overhead? Why? Maybe locking to prevent
>>> > > destroying? Then, we can use RCU to minimize the overhead as I mentioned.
>>> >
>>> > I get your point. Yes, In my previous patch, zswap_is_full() was the
>>> > only path to call
>>> > zbud_get_pool_size(). And your suggestion on patch v1 to remove the unnecessary
>>> > iteration will reduce the overhead further.
>>> >
>>> > So adding the calculating of all the pool size in zswap.c is better.
>>> >
>>> > >>
>>> > >> > Of course, it would need some lock or refcount to prevent destroy
>>> > >> > of zswap_tree in parallel with zswap_frontswap_invalidate_area but
>>> > >> > zswap_is_full doesn't need to be exact so RCU would be good fit.
>>> > >> >
>>> > >> > Most important point is that now zswap doesn't consider multiple swap.
>>> > >> > For example, Let's assume you uses two swap A and B with different priority
>>> > >> > and A already has charged 19% long time ago and let's assume that A swap is
>>> > >> > full now so VM start to use B so that B has charged 1% recently.
>>> > >> > It menas zswap charged (19% + 1%)i is full by default.
>>> > >> >
>>> > >> > Then, if VM want to swap out more pages into B, zbud_reclaim_page
>>> > >> > would be evict one of pages in B's pool and it would be repeated
>>> > >> > continuously. It's totally LRU reverse problem and swap thrashing in B
>>> > >> > would happen.
>>> > >> >
>>> > >>
>>> > >> The scenario is below:
>>> > >> There are 2 swap A, B in system. If pool size of A reach 19% of ram
>>> > >> size and swap A
>>> > >> is also full. Then swap B will be used. Pool size of B will be
>>> > >> increased until it hit
>>> > >> the 20% of the ram size. By now zswap pool size is about 39% of ram size.
>>> > >> If there are more than 2 swap file/device, zswap pool will expand out
>>> > >> of control
>>> > >> and there may be no swapout happened.
>>> > >
>>> > > I know.
>>> > >
>>> > >>
>>> > >> I think the original intention of zswap designer is to keep the total
>>> > >> zswap pools size below
>>> > >> 20% of RAM size.
>>> > >
>>> > > My point is your patch still doesn't solve the example I mentioned.
>>> >
>>> > Hmm. My patch only make sure all the zswap pools use maximum 20% of
>>> > RAM size. It is a new problem in your example. The zbud_reclaim_page would
>>> > not swap out the oldest zbud page when multiple swaps are used.
>>> >
>>> > Maybe the new problem can be resolved in another patch.
>>>
>>> It means current zswap has a problem in multiple swap but you want
>>> to fix a problem which happens only when it is used for multiple swap.
>>> So, I'm not sure we want a fix in this phase before discussing more
>>> fundamental thing.
>>>
>>
>> Yes, The bug which I want to fix only happens when multiple swap are used.
>>
>>> That's why I want to know why you want to use multiple swap with zswap
>>> but you are never saying it to us. :(
>>>
>>
>> If user uses more than one swap device/file, then this is an issue.
>> Zswap pool is created when a swap device/file is swapped on happens.
>> So there will be more than one zswap pool when user uses 2 or even
>> more swap devices/files.
>>
>> I am not sure whether multiple swap are popular. But if multiple swap
>> are swapped
>> on, then multiple zswap pool will be created. And the size of these pools may
>> out of control.
>
> Personally I don't think using multiple swap partitions/files has to
> be popular to need to solve this, it only needs to be possible, which
> it is.
>
> Why not just leave zbud unchanged, and sum up the total size using a
> list of active zswap_trees as Minchan suggested for the v1 patch? The

Yes. This is what I want to do in the v3 patch, once it is agreed that
this bug needs to be fixed.

Thanks.

> debugfs_create_u64("pool_pages") will probably need to be changed to
> debugfs_create_file() with a read function that calls the function to
> sum up the total.
>
>
>>
>> Thanks.
>>
>>> >
>>> > Thanks.
>>> >
>>> > >
>>> > >>
>>> > >> Thanks.
>>> > >>
>>> > >> > Please say your usecase scenario and if it's really problem,
>>> > >> > we need more surgery.
>>> > >> >
>>> > >> > Thanks.
>>> > >> >
>>> > >> >> For *pool->pages_nr*, one way is to use pool->lock to protect. But I
>>> > >> >> think it is too heavy.
>>> > >> >> So does it ok to change pages_nr to atomic type too?
>>> > >> >>
>>> > >> >>
>>> > >> >> >
>>> > >> >> >> + }
>>> > >> >> >> +
>>> > >> >> >> + return page;
>>> > >> >> >> +}
>>> > >> >> >> +
>>> > >> >> >> +
>>> > >> >> >> /* Resets the struct page fields and frees the page */
>>> > >> >> >> -static void free_zbud_page(struct zbud_header *zhdr)
>>> > >> >> >> +static void free_zbud_page(struct zbud_pool *pool, struct zbud_header *zhdr)
>>> > >> >> >> {
>>> > >> >> >> __free_page(virt_to_page(zhdr));
>>> > >> >> >> +
>>> > >> >> >> + pool->pages_nr--;
>>> > >> >> >> + total_zbud_pages--;
>>> > >> >> >> }
>>> > >> >> >>
>>> > >> >> >> /*
>>> > >> >> >> @@ -279,11 +304,10 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
>>> > >> >> >>
>>> > >> >> >> /* Couldn't find unbuddied zbud page, create new one */
>>> > >> >> >> spin_unlock(&pool->lock);
>>> > >> >> >> - page = alloc_page(gfp);
>>> > >> >> >> + page = alloc_zbud_page(pool, gfp);
>>> > >> >> >> if (!page)
>>> > >> >> >> return -ENOMEM;
>>> > >> >> >> spin_lock(&pool->lock);
>>> > >> >> >> - pool->pages_nr++;
>>> > >> >> >> zhdr = init_zbud_page(page);
>>> > >> >> >> bud = FIRST;
>>> > >> >> >>
>>> > >> >> >> @@ -349,8 +373,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
>>> > >> >> >> if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
>>> > >> >> >> /* zbud page is empty, free */
>>> > >> >> >> list_del(&zhdr->lru);
>>> > >> >> >> - free_zbud_page(zhdr);
>>> > >> >> >> - pool->pages_nr--;
>>> > >> >> >> + free_zbud_page(pool, zhdr);
>>> > >> >> >> } else {
>>> > >> >> >> /* Add to unbuddied list */
>>> > >> >> >> freechunks = num_free_chunks(zhdr);
>>> > >> >> >> @@ -447,8 +470,7 @@ next:
>>> > >> >> >> * Both buddies are now free, free the zbud page and
>>> > >> >> >> * return success.
>>> > >> >> >> */
>>> > >> >> >> - free_zbud_page(zhdr);
>>> > >> >> >> - pool->pages_nr--;
>>> > >> >> >> + free_zbud_page(pool, zhdr);
>>> > >> >> >> spin_unlock(&pool->lock);
>>> > >> >> >> return 0;
>>> > >> >> >> } else if (zhdr->first_chunks == 0 ||
>>> > >> >> >> @@ -496,14 +518,12 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
>>> > >> >> >>
>>> > >> >> >> /**
>>> > >> >> >> * zbud_get_pool_size() - gets the zbud pool size in pages
>>> > >> >> >> - * @pool: pool whose size is being queried
>>> > >> >> >> *
>>> > >> >> >> - * Returns: size in pages of the given pool. The pool lock need not be
>>> > >> >> >> - * taken to access pages_nr.
>>> > >> >> >> + * Returns: size in pages of all the zbud pools.
>>> > >> >> >> */
>>> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool)
>>> > >> >> >> +u64 zbud_get_pool_size(void)
>>> > >> >> >> {
>>> > >> >> >> - return pool->pages_nr;
>>> > >> >> >> + return total_zbud_pages;
>>> > >> >> >> }
>>> > >> >> >>
>>> > >> >> >> static int __init init_zbud(void)
>>> > >> >> >> diff --git a/mm/zswap.c b/mm/zswap.c
>>> > >> >> >> index 5a63f78..ef44d9d 100644
>>> > >> >> >> --- a/mm/zswap.c
>>> > >> >> >> +++ b/mm/zswap.c
>>> > >> >> >> @@ -291,7 +291,7 @@ static void zswap_free_entry(struct zswap_tree *tree,
>>> > >> >> >> zbud_free(tree->pool, entry->handle);
>>> > >> >> >> zswap_entry_cache_free(entry);
>>> > >> >> >> atomic_dec(&zswap_stored_pages);
>>> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>>> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
>>> > >> >> >> }
>>> > >> >> >>
>>> > >> >> >> /* caller must hold the tree lock */
>>> > >> >> >> @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>>> > >> >> >>
>>> > >> >> >> /* update stats */
>>> > >> >> >> atomic_inc(&zswap_stored_pages);
>>> > >> >> >> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>>> > >> >> >> + zswap_pool_pages = zbud_get_pool_size();
>>> > >> >> >>
>>> > >> >> >> return 0;
>>> > >> >> >>
>>> > >> >> >> --
>>> > >> >> >> 1.7.10.4
>>> > >> >> >>
>>> > >> >> >
>>> > >> >> > --
>>> > >> >> > Kind regards,
>>> > >> >> > Minchan Kim
>>> > >> >>
>>> > >> >
>>> > >> > --
>>> > >> > Kind regards,
>>> > >> > Minchan Kim
>>> > >>
>>> > >
>>> > > --
>>> > > Kind regards,
>>> > > Minchan Kim
>>> >
>>>
>>> --
>>> Kind regards,
>>> Minchan Kim
>>

2014-01-23 03:01:28

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Cai,

On Thu, Jan 23, 2014 at 09:38:41AM +0800, Cai Liu wrote:
> Hello Dan
>
> 2014/1/22 Dan Streetman <[email protected]>:
> > On Wed, Jan 22, 2014 at 7:16 AM, Cai Liu <[email protected]> wrote:
> >> Hello Minchan
> >>
> >>
> >> 2014/1/22 Minchan Kim <[email protected]>
> >>>
> >>> Hello Cai,
> >>>
> >>> On Tue, Jan 21, 2014 at 09:52:25PM +0800, Cai Liu wrote:
> >>> > Hello Minchan
> >>> >
> >>> > 2014/1/21 Minchan Kim <[email protected]>:
> >>> > > Hello,
> >>> > >
> >>> > > On Tue, Jan 21, 2014 at 02:35:07PM +0800, Cai Liu wrote:
> >>> > >> 2014/1/21 Minchan Kim <[email protected]>:
> >>> > >> > Please check your MUA and don't break thread.
> >>> > >> >
> >>> > >> > On Tue, Jan 21, 2014 at 11:07:42AM +0800, Cai Liu wrote:
> >>> > >> >> Thanks for your review.
> >>> > >> >>
> >>> > >> >> 2014/1/21 Minchan Kim <[email protected]>:
> >>> > >> >> > Hello Cai,
> >>> > >> >> >
> >>> > >> >> > On Mon, Jan 20, 2014 at 03:50:18PM +0800, Cai Liu wrote:
> >>> > >> >> >> zswap can support multiple swapfiles. So we need to check
> >>> > >> >> >> all zbud pool pages in zswap.
> >>> > >> >> >>
> >>> > >> >> >> Version 2:
> >>> > >> >> >> * add *total_zbud_pages* in zbud to record all the pages in pools
> >>> > >> >> >> * move the updating of pool pages statistics to
> >>> > >> >> >> alloc_zbud_page/free_zbud_page to hide the details
> >>> > >> >> >>
> >>> > >> >> >> Signed-off-by: Cai Liu <[email protected]>
> >>> > >> >> >> ---
> >>> > >> >> >> include/linux/zbud.h | 2 +-
> >>> > >> >> >> mm/zbud.c | 44 ++++++++++++++++++++++++++++++++------------
> >>> > >> >> >> mm/zswap.c | 4 ++--
> >>> > >> >> >> 3 files changed, 35 insertions(+), 15 deletions(-)
> >>> > >> >> >>
> >>> > >> >> >> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> >>> > >> >> >> index 2571a5c..1dbc13e 100644
> >>> > >> >> >> --- a/include/linux/zbud.h
> >>> > >> >> >> +++ b/include/linux/zbud.h
> >>> > >> >> >> @@ -17,6 +17,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle);
> >>> > >> >> >> int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> >>> > >> >> >> void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> >>> > >> >> >> void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> >>> > >> >> >> -u64 zbud_get_pool_size(struct zbud_pool *pool);
> >>> > >> >> >> +u64 zbud_get_pool_size(void);
> >>> > >> >> >>
> >>> > >> >> >> #endif /* _ZBUD_H_ */
> >>> > >> >> >> diff --git a/mm/zbud.c b/mm/zbud.c
> >>> > >> >> >> index 9451361..711aaf4 100644
> >>> > >> >> >> --- a/mm/zbud.c
> >>> > >> >> >> +++ b/mm/zbud.c
> >>> > >> >> >> @@ -52,6 +52,13 @@
> >>> > >> >> >> #include <linux/spinlock.h>
> >>> > >> >> >> #include <linux/zbud.h>
> >>> > >> >> >>
> >>> > >> >> >> +/*********************************
> >>> > >> >> >> +* statistics
> >>> > >> >> >> +**********************************/
> >>> > >> >> >> +
> >>> > >> >> >> +/* zbud pages in all pools */
> >>> > >> >> >> +static u64 total_zbud_pages;
> >>> > >> >> >> +
> >>> > >> >> >> /*****************
> >>> > >> >> >> * Structures
> >>> > >> >> >> *****************/
> >>> > >> >> >> @@ -142,10 +149,28 @@ static struct zbud_header *init_zbud_page(struct page *page)
> >>> > >> >> >> return zhdr;
> >>> > >> >> >> }
> >>> > >> >> >>
> >>> > >> >> >> +static struct page *alloc_zbud_page(struct zbud_pool *pool, gfp_t gfp)
> >>> > >> >> >> +{
> >>> > >> >> >> + struct page *page;
> >>> > >> >> >> +
> >>> > >> >> >> + page = alloc_page(gfp);
> >>> > >> >> >> +
> >>> > >> >> >> + if (page) {
> >>> > >> >> >> + pool->pages_nr++;
> >>> > >> >> >> + total_zbud_pages++;
> >>> > >> >> >
> >>> > >> >> > Who protect race?
> >>> > >> >>
> >>> > >> >> Yes, here the pool->pages_nr and also the total_zbud_pages are not protected.
> >>> > >> >> I will re-do it.
> >>> > >> >>
> >>> > >> >> I will change *total_zbud_pages* to atomic type.
> >>> > >> >
> >>> > >> > Wait, it doesn't make sense. Now, you assume zbud allocator would be used
> >>> > >> > for only zswap. It's true until now but we couldn't make sure it in future.
> >>> > >> > If other user start to use zbud allocator, total_zbud_pages would be pointless.
> >>> > >>
> >>> > >> Yes, you are right. ZBUD is a common module. So in this patch calculate the
> >>> > >> zswap pool size in zbud is not suitable.
> >>> > >>
> >>> > >> >
> >>> > >> > Another concern is that what's your scenario for above two swap?
> >>> > >> > How often we need to call zbud_get_pool_size?
> >>> > >> > In previous your patch, you reduced the number of call so IIRC,
> >>> > >> > we only called it in zswap_is_full and for debugfs.
> >>> > >>
> >>> > >> zbud_get_pool_size() is called frequently when adding/freeing zswap
> >>> > >> entry happen in zswap . This is why in this patch I added a counter in zbud,
> >>> > >> and then in zswap the iteration of zswap_list to calculate the pool size will
> >>> > >> not be needed.
> >>> > >
> >>> > > We can remove updating zswap_pool_pages in zswap_frontswap_store and
> >>> > > zswap_free_entry as I said. So zswap_is_full is only hot spot.
> >>> > > Do you think it's still big overhead? Why? Maybe locking to prevent
> >>> > > destroying? Then, we can use RCU to minimize the overhead as I mentioned.
> >>> >
> >>> > I get your point. Yes, In my previous patch, zswap_is_full() was the
> >>> > only path to call
> >>> > zbud_get_pool_size(). And your suggestion on patch v1 to remove the unnecessary
> >>> > iteration will reduce the overhead further.
> >>> >
> >>> > So adding the calculating of all the pool size in zswap.c is better.
> >>> >
> >>> > >>
> >>> > >> > Of course, it would need some lock or refcount to prevent destroy
> >>> > >> > of zswap_tree in parallel with zswap_frontswap_invalidate_area but
> >>> > >> > zswap_is_full doesn't need to be exact so RCU would be good fit.
> >>> > >> >
> >>> > >> > Most important point is that now zswap doesn't consider multiple swap.
> >>> > >> > For example, Let's assume you uses two swap A and B with different priority
> >>> > >> > and A already has charged 19% long time ago and let's assume that A swap is
> >>> > >> > full now so VM start to use B so that B has charged 1% recently.
> >>> > >> > It menas zswap charged (19% + 1%)i is full by default.
> >>> > >> >
> >>> > >> > Then, if VM want to swap out more pages into B, zbud_reclaim_page
> >>> > >> > would be evict one of pages in B's pool and it would be repeated
> >>> > >> > continuously. It's totally LRU reverse problem and swap thrashing in B
> >>> > >> > would happen.
> >>> > >> >
> >>> > >>
> >>> > >> The scenario is below:
> >>> > >> There are 2 swap A, B in system. If pool size of A reach 19% of ram
> >>> > >> size and swap A
> >>> > >> is also full. Then swap B will be used. Pool size of B will be
> >>> > >> increased until it hit
> >>> > >> the 20% of the ram size. By now zswap pool size is about 39% of ram size.
> >>> > >> If there are more than 2 swap file/device, zswap pool will expand out
> >>> > >> of control
> >>> > >> and there may be no swapout happened.
> >>> > >
> >>> > > I know.
> >>> > >
> >>> > >>
> >>> > >> I think the original intention of zswap designer is to keep the total
> >>> > >> zswap pools size below
> >>> > >> 20% of RAM size.
> >>> > >
> >>> > > My point is your patch still doesn't solve the example I mentioned.
> >>> >
> >>> > Hmm. My patch only make sure all the zswap pools use maximum 20% of
> >>> > RAM size. It is a new problem in your example. The zbud_reclaim_page would
> >>> > not swap out the oldest zbud page when multiple swaps are used.
> >>> >
> >>> > Maybe the new problem can be resolved in another patch.
> >>>
> >>> It means current zswap has a problem in multiple swap but you want
> >>> to fix a problem which happens only when it is used for multiple swap.
> >>> So, I'm not sure we want a fix in this phase before discussing more
> >>> fundamental thing.
> >>>
> >>
> >> Yes, The bug which I want to fix only happens when multiple swap are used.
> >>
> >>> That's why I want to know why you want to use multiple swap with zswap
> >>> but you are never saying it to us. :(
> >>>
> >>
> >> If user uses more than one swap device/file, then this is an issue.
> >> Zswap pool is created when a swap device/file is swapped on happens.
> >> So there will be more than one zswap pool when user uses 2 or even
> >> more swap devices/files.
> >>
> >> I am not sure whether multiple swap are popular. But if multiple swap
> >> are swapped
> >> on, then multiple zswap pool will be created. And the size of these pools may
> >> out of control.
> >
> > Personally I don't think using multiple swap partitions/files has to
> > be popular to need to solve this, it only needs to be possible, which
> > it is.
> >
> > Why not just leave zbud unchanged, and sum up the total size using a
> > list of active zswap_trees as Minchan suggested for the v1 patch? The
>
> Yes. This is what I want to do in the v3 patch after this bug is considered need
> to be fixed.

In my position, I'd like to fix the zswap multiple-swap problem first,
and I like Weijie's suggestion.

So, how about this?
I haven't looked at the code in detail; I just want to show the concept.
That's why I added the RFC tag.

From 67c64746e977a091ee30ca37bbc034990adf5ca5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <[email protected]>
Date: Thu, 23 Jan 2014 11:41:44 +0900
Subject: [RFC] zswap: support multiple swap

Cai Liu reported that zbud pool page counting has a problem when
multiple swaps are used: it counts only one swap among the multiple
swaps instead of all of them, so zswap cannot control writeback
properly. The result is unnecessary writeback, or no writeback when
we really should write back. IOW, it makes zswap behave erratically.

Another problem in zswap is as follows.
For example, let's assume we use two swaps A and B with different
priorities, and A charged 19% a long time ago; assume swap A is now
full, so the VM starts to use B, which has recently charged 1%. That
means zswap's charge (19% + 1%) hits the 20% default limit. Then, if
the VM wants to swap out more pages into B, zbud_reclaim_page would
evict one of the pages in B's pool, and this would be repeated
continuously. It's a complete LRU-inversion problem, and swap
thrashing in B would happen.

This patch makes zswap consider multiple swaps by creating *a single*
zbud pool that is shared by all swaps, so all zswap pages across the
swaps keep LRU order, which prevents both problems above.

Reported-by: Cai Liu <[email protected]>
Suggested-by: Weijie Yang <[email protected]>
Signed-off-by: Minchan Kim <[email protected]>
---
mm/zswap.c | 56 +++++++++++++++++++++++++++++---------------------------
1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78a5601..96039e86db79 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -89,6 +89,8 @@ static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent,
zswap_max_pool_percent, uint, 0644);

+static struct zbud_pool *mem_pool;
+
/*********************************
* compression functions
**********************************/
@@ -189,7 +191,6 @@ struct zswap_header {
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
- struct zbud_pool *pool;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -288,10 +289,10 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
static void zswap_free_entry(struct zswap_tree *tree,
struct zswap_entry *entry)
{
- zbud_free(tree->pool, entry->handle);
+ zbud_free(mem_pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(mem_pool);
}

/* caller must hold the tree lock */
@@ -545,7 +546,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
zbud_unmap(pool, handle);
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
- BUG_ON(pool != tree->pool);
+ BUG_ON(pool != mem_pool);

/* find and ref zswap entry */
spin_lock(&tree->lock);
@@ -573,13 +574,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(mem_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
entry->length, dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(mem_pool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);

@@ -652,7 +653,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
- if (zbud_reclaim_page(tree->pool, 8)) {
+ if (zbud_reclaim_page(mem_pool, 8)) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
@@ -679,7 +680,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,

/* store */
len = dlen + sizeof(struct zswap_header);
- ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
+ ret = zbud_alloc(mem_pool, len, __GFP_NORETRY | __GFP_NOWARN,
&handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
@@ -689,11 +690,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto freepage;
}
- zhdr = zbud_map(tree->pool, handle);
+ zhdr = zbud_map(mem_pool, handle);
zhdr->swpentry = swp_entry(type, offset);
buf = (u8 *)(zhdr + 1);
memcpy(buf, dst, dlen);
- zbud_unmap(tree->pool, handle);
+ zbud_unmap(mem_pool, handle);
put_cpu_var(zswap_dstmem);

/* populate entry */
@@ -716,7 +717,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,

/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(mem_pool);

return 0;

@@ -752,13 +753,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,

/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(mem_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(mem_pool, entry->handle);
BUG_ON(ret);

spin_lock(&tree->lock);
@@ -807,8 +808,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
zswap_free_entry(tree, entry);
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
-
- zbud_destroy_pool(tree->pool);
kfree(tree);
zswap_trees[type] = NULL;
}
@@ -822,20 +821,14 @@ static void zswap_frontswap_init(unsigned type)
struct zswap_tree *tree;

tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
- if (!tree)
- goto err;
- tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
- if (!tree->pool)
- goto freetree;
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
tree->rbroot = RB_ROOT;
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
- return;
-
-freetree:
- kfree(tree);
-err:
- pr_err("alloc failed, zswap disabled for swap type %d\n", type);
}

static struct frontswap_ops zswap_frontswap_ops = {
@@ -907,9 +900,14 @@ static int __init init_zswap(void)
return 0;

pr_info("loading zswap\n");
+
+ mem_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+ if (!mem_pool)
+ goto error;
+
if (zswap_entry_cache_create()) {
pr_err("entry cache creation failed\n");
- goto error;
+ goto cachefail;
}
if (zswap_comp_init()) {
pr_err("compressor initialization failed\n");
@@ -919,6 +917,8 @@ static int __init init_zswap(void)
pr_err("per-cpu initialization failed\n");
goto pcpufail;
}
+
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
@@ -927,6 +927,8 @@ pcpufail:
zswap_comp_exit();
compfail:
zswap_entry_cache_destory();
+cachefail:
+ zbud_destroy_pool(mem_pool);
error:
return -ENOMEM;
}
--
1.8.5.2
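
To make the new error unwinding easier to follow, here is roughly what the
tail of init_zswap() looks like after this patch (a sketch reconstructed from
the hunks above; the per-cpu init call is assumed to be zswap_cpu_init(),
which the diff does not show):

	mem_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
	if (!mem_pool)
		goto error;

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cachefail;
	}
	if (zswap_comp_init()) {
		pr_err("compressor initialization failed\n");
		goto compfail;
	}
	if (zswap_cpu_init()) {			/* assumed name */
		pr_err("per-cpu initialization failed\n");
		goto pcpufail;
	}

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;

pcpufail:
	zswap_comp_exit();
compfail:
	zswap_entry_cache_destory();
cachefail:
	zbud_destroy_pool(mem_pool);
error:
	return -ENOMEM;

Each label undoes only the steps that succeeded before the failing one, in
reverse order of initialization, which is why the new cachefail label
destroys the pool that is now created first.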


--
Kind regards,
Minchan Kim

2014-01-23 06:30:19

by Cai Liu

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Minchan

2014/1/23 Minchan Kim <[email protected]>:
> Hello Cai,
>
> On Thu, Jan 23, 2014 at 09:38:41AM +0800, Cai Liu wrote:
>> [...]
>
> In my position, I'd like to fix zswap and multiple swap problem firstly
> and like the Weijie's suggestion.
>
> So, how about this?
> I didn't look at code in detail and want to show the concept.

I read the RFC patch. I think it's perfect.

> That's why I added RFC tag.
>
> From 67c64746e977a091ee30ca37bbc034990adf5ca5 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <[email protected]>
> Date: Thu, 23 Jan 2014 11:41:44 +0900
> Subject: [RFC] zswap: support multiple swap
>
> [...]

2014-01-24 14:20:39

by Weijie Yang

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

On Thu, Jan 23, 2014 at 2:30 PM, Cai Liu <[email protected]> wrote:
> [...]
>
>> From 67c64746e977a091ee30ca37bbc034990adf5ca5 Mon Sep 17 00:00:00 2001
>> From: Minchan Kim <[email protected]>
>> Date: Thu, 23 Jan 2014 11:41:44 +0900
>> Subject: [RFC] zswap: support multiple swap
>>
>> [...]

Hi, Minchan

I reviewed this patch and it looks good to me. Just a couple of small nitpicks, see below.

Regards

>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 5a63f78a5601..96039e86db79 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -89,6 +89,8 @@ static unsigned int zswap_max_pool_percent = 20;
>> module_param_named(max_pool_percent,
>> zswap_max_pool_percent, uint, 0644);
>>
>> +static struct zbud_pool *mem_pool;
>> +

nitpick1: I'd like to keep logically related code together, e.g. put this
mem_pool definition next to zswap_trees and zswap_entry_cache, as sketched
below. Just my preference, of course you can ignore it.
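
A minimal sketch of that grouping (zswap_entry_cache is assumed to be the
kmem_cache pointer behind zswap_entry_cache_create, which the diff does not
show):

static struct kmem_cache *zswap_entry_cache;	/* assumed declaration */
static struct zbud_pool *mem_pool;		/* shared by all swap types */
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];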

>> /*********************************
>> * compression functions
>> **********************************/
>> @@ -189,7 +191,6 @@ struct zswap_header {
>> struct zswap_tree {
>> struct rb_root rbroot;
>> spinlock_t lock;
>> - struct zbud_pool *pool;
>> };
>>
>> static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
>> @@ -288,10 +289,10 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
>> static void zswap_free_entry(struct zswap_tree *tree,
>> struct zswap_entry *entry)

nitpick2: How about removing the tree parameter from zswap_free_entry? It is unused now that the pool is shared.

>> {
>> - zbud_free(tree->pool, entry->handle);
>> + zbud_free(mem_pool, entry->handle);
>> zswap_entry_cache_free(entry);
>> atomic_dec(&zswap_stored_pages);
>> - zswap_pool_pages = zbud_get_pool_size(tree->pool);
>> + zswap_pool_pages = zbud_get_pool_size(mem_pool);
>> }
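
Something like this, roughly (an untested sketch):

static void zswap_free_entry(struct zswap_entry *entry)
{
	zbud_free(mem_pool, entry->handle);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_pool_pages = zbud_get_pool_size(mem_pool);
}

and the callers would just call zswap_free_entry(entry) without the tree.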
>> [...]

2014-01-27 02:20:02

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH v2] mm/zswap: Check all pool pages instead of one pool pages

Hello Weijie,

On Fri, Jan 24, 2014 at 10:20:36PM +0800, Weijie Yang wrote:
> [...]
>
> >> diff --git a/mm/zswap.c b/mm/zswap.c
> >> index 5a63f78a5601..96039e86db79 100644
> >> --- a/mm/zswap.c
> >> +++ b/mm/zswap.c
> >> @@ -89,6 +89,8 @@ static unsigned int zswap_max_pool_percent = 20;
> >> module_param_named(max_pool_percent,
> >> zswap_max_pool_percent, uint, 0644);
> >>
> >> +static struct zbud_pool *mem_pool;
> >> +
>
> nitpick1: I'd like to keep logically related code together, e.g. put this
> mem_pool definition next to zswap_trees and zswap_entry_cache.
> Just my preference, of course you can ignore it.

I'd like to stress that the mem_pool is shared by several zswap_trees rather
than tie it to zswap_tree, so:

static struct zbud_pool *shared_mem_pool;


>
> >> [...]
> >> @@ -288,10 +289,10 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
> >> static void zswap_free_entry(struct zswap_tree *tree,
> >> struct zswap_entry *entry)
>
> nitpick2: How about removing the tree parameter from zswap_free_entry? It is unused now that the pool is shared.

Nice catch! I will fix it and send the patch when the merge window closes.
Thanks for the review, Weijie!


--
Kind regards,
Minchan Kim