Date: Sat, 5 Mar 2022 05:10:25 +0000
From: Hyeonggon Yoo <42.hyeyoo@gmail.com>
To: Vlastimil Babka
Cc: linux-mm@kvack.org, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton, Marco Elver, Matthew Wilcox,
	Roman Gushchin, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2 1/5] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator
Message-ID:
References: <20220304063427.372145-1-42.hyeyoo@gmail.com>
 <20220304063427.372145-2-42.hyeyoo@gmail.com>
 <213a7412-5439-2e8a-986f-6836f3976432@suse.cz>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <213a7412-5439-2e8a-986f-6836f3976432@suse.cz>

On Fri, Mar 04, 2022 at 01:45:16PM +0100, Vlastimil Babka wrote:
> On 3/4/22 07:34, Hyeonggon Yoo wrote:
> > There is not much benefit for serving large objects in kmalloc().
> > Let's pass large requests to page allocator like SLUB for better
> > maintenance of common code.
> >
> > Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
>
> Looks like it's on the right path! Some suggestions follow.
>

Thank you for reviewing this! You have a really good eye.

> > ---
> >  include/linux/slab.h | 35 ++++++++++++++++-------------------
> >  mm/slab.c            | 31 +++++++++++++++++++++++++++----
> >  mm/slab.h            | 19 +++++++++++++++++++
> >  mm/slub.c            | 19 -------------------
> >  4 files changed, 62 insertions(+), 42 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index 37bde99b74af..e7b3330db4f3 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -224,29 +224,19 @@ void kmem_dump_obj(void *object);
> >   * Kmalloc array related definitions
> >   */
> >
> > -#ifdef CONFIG_SLAB
> >  /*
> > - * The largest kmalloc size supported by the SLAB allocators is
> > - * 32 megabyte (2^25) or the maximum allocatable page order if that is
> > - * less than 32 MB.
> > - *
> > - * WARNING: Its not easy to increase this value since the allocators have
> > - * to do various tricks to work around compiler limitations in order to
> > - * ensure proper constant folding.
> > + * SLAB and SLUB directly allocates requests fitting in to an order-1 page
> > + * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> >   */
> > -#define KMALLOC_SHIFT_HIGH	((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
> > -				(MAX_ORDER + PAGE_SHIFT - 1) : 25)
> > -#define KMALLOC_SHIFT_MAX	KMALLOC_SHIFT_HIGH
> > +#ifdef CONFIG_SLAB
> > +#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
> > +#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
>
> I think we should be able to also remove the larger sizes from
> __kmalloc_index() later in this file.
>

Right. We can remove sizes larger than max(PAGE_SHIFT) + 1.
Searching with cscope, the maximum PAGE_SHIFT is 20 (1MB), on hexagon.

I think we can remove anything larger than 2MB. But hmm, I'll add
something like static_assert(PAGE_SHIFT <= 20) and see if any
architecture/config complains about this.
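For the record, roughly what I mean (just a sketch; where exactly the
assertion should live, and which __kmalloc_index() cases can really be
dropped, still needs checking on top of this series):

/*
 * Requests larger than KMALLOC_MAX_CACHE_SIZE (2 * PAGE_SIZE) now go to
 * the page allocator, and no architecture has PAGE_SHIFT above 20 today,
 * so a constant-folded kmalloc() should never ask kmalloc_index() for
 * anything beyond the 2MB case.
 */
static_assert(PAGE_SHIFT <= 20);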
> >  #ifndef KMALLOC_SHIFT_LOW
> >  #define KMALLOC_SHIFT_LOW	5
> >  #endif
> >  #endif
> >
> >  #ifdef CONFIG_SLUB
> > -/*
> > - * SLUB directly allocates requests fitting in to an order-1 page
> > - * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> > - */
> >  #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
> >  #define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
> >  #ifndef KMALLOC_SHIFT_LOW
> > @@ -564,15 +554,15 @@ static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t fl
> >   * Try really hard to succeed the allocation but fail
> >   * eventually.
> >   */
> > +#ifndef CONFIG_SLOB
> >  static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
> >  {
> >  	if (__builtin_constant_p(size)) {
> > -#ifndef CONFIG_SLOB
> >  		unsigned int index;
> > -#endif
> > +
> >  		if (size > KMALLOC_MAX_CACHE_SIZE)
> >  			return kmalloc_large(size, flags);
> > -#ifndef CONFIG_SLOB
> > +
> >  		index = kmalloc_index(size);
> >
> >  		if (!index)
> > @@ -581,10 +571,17 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
> >  		return kmem_cache_alloc_trace(
> >  				kmalloc_caches[kmalloc_type(flags)][index],
> >  				flags, size);
> > -#endif
> >  	}
> >  	return __kmalloc(size, flags);
> >  }
> > +#else
> > +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
> > +{
> > +	if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE)
> > +		return kmalloc_large(size, flags);
> > +	return __kmalloc(size, flags);
> > +}
> > +#endif
> >
> >  static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
> >  {
> > diff --git a/mm/slab.c b/mm/slab.c
> > index ddf5737c63d9..570af6dc3478 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -3624,7 +3624,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
> >  	void *ret;
> >
> >  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> > -		return NULL;
> > +		return kmalloc_large(size, flags);
>
> This loses the node information and NUMA locality.

Oh, my fault. Thank you for catching this.

> It would be better to generalize kmalloc_large_node() from mm/slub.c instead.

Sounds nice.

> > +
> >  	cachep = kmalloc_slab(size, flags);
> >  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
> >  		return cachep;
> > @@ -3685,7 +3686,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
> >  	void *ret;
> >
> >  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> > -		return NULL;
> > +		return kmalloc_large(size, flags);
> > +
> >  	cachep = kmalloc_slab(size, flags);
> >  	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
> >  		return cachep;
> > @@ -3739,14 +3741,21 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
> >  {
> >  	struct kmem_cache *s;
> >  	size_t i;
> > +	struct folio *folio;
> >
> >  	local_irq_disable();
> >  	for (i = 0; i < size; i++) {
> >  		void *objp = p[i];
> >
> > -		if (!orig_s) /* called via kfree_bulk */
> > +		if (!orig_s) {
> > +			/* called via kfree_bulk */
> > +			folio = virt_to_folio(objp);
> > +			if (unlikely(!folio_test_slab(folio))) {
> > +				free_large_kmalloc(folio, objp);
>
> Hmm I'm not sure it's a good idea to call this and the page allocator with
> disabled irq's. Maybe simply re-enable around it?

Seems reasonable. Re-enabling around here will be sufficient for this
function (quick sketch below).

By the way, it seems SLAB really does not care about disabling irqs even
when it's unnecessary. I would like to try reducing the irq-disabled area
in SLAB and see how it affects macro/micro benchmarks after this series
is done :)
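Something like this (untested sketch for v3; it keeps virt_to_cache() here,
switching to folio_slab() as you suggest below is a separate change):

	local_irq_disable();
	for (i = 0; i < size; i++) {
		void *objp = p[i];

		if (!orig_s) {
			/* called via kfree_bulk */
			folio = virt_to_folio(objp);
			if (unlikely(!folio_test_slab(folio))) {
				/*
				 * Don't call the page allocator with irqs
				 * disabled: re-enable just around the large
				 * kmalloc free, then disable again for the
				 * rest of the loop.
				 */
				local_irq_enable();
				free_large_kmalloc(folio, objp);
				local_irq_disable();
				continue;
			}
			s = virt_to_cache(objp);
		} else
			s = cache_from_obj(orig_s, objp);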
> > +				continue;
> > +			}
> >  			s = virt_to_cache(objp);
>
> Since we already have the folio, we should instead use folio_slab and
> slab->slab_cache (see how mm/slub.c does it).
> The same applies to functions below as well.

Right. I'll change them in v3.

> > -		else
> > +		} else
> >  			s = cache_from_obj(orig_s, objp);
> >  		if (!s)
> >  			continue;
> > @@ -3776,11 +3785,20 @@ void kfree(const void *objp)
> >  {
> >  	struct kmem_cache *c;
> >  	unsigned long flags;
> > +	struct folio *folio;
> > +	void *object = (void *) objp;
> >
> >  	trace_kfree(_RET_IP_, objp);
> >
> >  	if (unlikely(ZERO_OR_NULL_PTR(objp)))
> >  		return;
> > +
> > +	folio = virt_to_folio(objp);
> > +	if (unlikely(!folio_test_slab(folio))) {
> > +		free_large_kmalloc(folio, object);
> > +		return;
> > +	}
> > +
> >  	local_irq_save(flags);
> >  	kfree_debugcheck(objp);
> >  	c = virt_to_cache(objp);
> > @@ -4211,12 +4229,17 @@ void __check_heap_object(const void *ptr, unsigned long n,
> >  size_t __ksize(const void *objp)
> >  {
> >  	struct kmem_cache *c;
> > +	struct folio *folio;
> >  	size_t size;
> >
> >  	BUG_ON(!objp);
> >  	if (unlikely(objp == ZERO_SIZE_PTR))
> >  		return 0;
> >
> > +	folio = virt_to_folio(objp);
> > +	if (!folio_test_slab(folio))
> > +		return folio_size(folio);
> > +
> >  	c = virt_to_cache(objp);
> >  	size = c ? c->object_size : 0;
> >
> > diff --git a/mm/slab.h b/mm/slab.h
> > index c7f2abc2b154..31e98beb47a3 100644
> > --- a/mm/slab.h
> > +++ b/mm/slab.h
> > @@ -664,6 +664,25 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
> >  	print_tracking(cachep, x);
> >  	return cachep;
> >  }
> > +
> > +static __always_inline void kfree_hook(void *x)
> > +{
> > +	kmemleak_free(x);
> > +	kasan_kfree_large(x);
>
> Looks like there's only one caller of kfree_hook() so we could do that
> directly there.
>

Right.
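i.e. fold it into free_large_kmalloc() in mm/slab.h, something like this
(sketch based on the version this patch moves over from mm/slub.c):

static inline void free_large_kmalloc(struct folio *folio, void *object)
{
	unsigned int order = folio_order(folio);

	if (WARN_ON_ONCE(order == 0))
		pr_warn_once("object pointer: 0x%p\n", object);

	/* was kfree_hook(object); open-coded since this is its only caller */
	kmemleak_free(object);
	kasan_kfree_large(object);

	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
			      -(PAGE_SIZE << order));
	__free_pages(folio_page(folio, 0), order);
}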
> > +}
> > +
> > +static inline void free_large_kmalloc(struct folio *folio, void *object)
> > +{
> > +	unsigned int order = folio_order(folio);
> > +
> > +	if (WARN_ON_ONCE(order == 0))
> > +		pr_warn_once("object pointer: 0x%p\n", object);
> > +
> > +	kfree_hook(object);
> > +	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
> > +			      -(PAGE_SIZE << order));
> > +	__free_pages(folio_page(folio, 0), order);
> > +}
> >  #endif /* CONFIG_SLOB */
> >
> >  static inline size_t slab_ksize(const struct kmem_cache *s)
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 261474092e43..04fd084f4709 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1686,12 +1686,6 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
> >  	return ptr;
> >  }
> >
> > -static __always_inline void kfree_hook(void *x)
> > -{
> > -	kmemleak_free(x);
> > -	kasan_kfree_large(x);
> > -}
> > -
> >  static __always_inline bool slab_free_hook(struct kmem_cache *s,
> >  					   void *x, bool init)
> >  {
> > @@ -3535,19 +3529,6 @@ struct detached_freelist {
> >  	struct kmem_cache *s;
> >  };
> >
> > -static inline void free_large_kmalloc(struct folio *folio, void *object)
> > -{
> > -	unsigned int order = folio_order(folio);
> > -
> > -	if (WARN_ON_ONCE(order == 0))
> > -		pr_warn_once("object pointer: 0x%p\n", object);
> > -
> > -	kfree_hook(object);
> > -	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
> > -			      -(PAGE_SIZE << order));
> > -	__free_pages(folio_page(folio, 0), order);
> > -}
> > -
> >  /*
> >   * This function progressively scans the array with free objects (with
> >   * a limited look ahead) and extract objects belonging to the same
>

Thank you so much!

--
Thank you, You are awesome!
Hyeonggon :-)