The per-cpu slab is designed to be the primary path for allocation in SLUB
since it is assumed allocations will go through the fast path if possible.
When debugging is enabled, the fast path is disabled and per-cpu
allocations are not used. The current debugging code path still activates
the cpu slab for allocations and then immediately deactivates it. This
is useless work. When a slab is enabled for debugging, skip cpu
activation.
Signed-off-by: Laura Abbott <[email protected]>
---
This is a follow-on to the optimization of the debug paths for poisoning.
With this I get a ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
and no noticeable change with slub_debug=- .
---
mm/slub.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 77 insertions(+), 5 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 7277413..4507bd8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1482,8 +1482,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
}
page->freelist = fixup_red_left(s, start);
- page->inuse = page->objects;
- page->frozen = 1;
+ page->inuse = kmem_cache_debug(s) ? 1 : page->objects;
+ page->frozen = kmem_cache_debug(s) ? 0 : 1;
out:
if (gfpflags_allow_blocking(flags))
@@ -1658,6 +1658,64 @@ static inline void *acquire_slab(struct kmem_cache *s,
return freelist;
}
+
+static inline void *acquire_slab_debug(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode, int *objects)
+{
+ void *freelist;
+ unsigned long counters;
+ struct page new;
+ void *next;
+
+ lockdep_assert_held(&n->list_lock);
+
+
+ /*
+ * Zap the freelist and set the frozen bit.
+ * The old freelist is the list of objects for the
+ * per cpu allocation list.
+ */
+ freelist = page->freelist;
+ counters = page->counters;
+
+ BUG_ON(!freelist);
+
+ next = get_freepointer_safe(s, freelist);
+
+ new.counters = counters;
+ *objects = new.objects - new.inuse;
+ if (mode) {
+ new.inuse++;
+ new.freelist = next;
+ } else {
+ BUG();
+ }
+
+ VM_BUG_ON(new.frozen);
+
+ if (!new.freelist) {
+ remove_partial(n, page);
+ add_full(s, n, page);
+ }
+
+ if (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ new.freelist, new.counters,
+ "acquire_slab")) {
+ if (!new.freelist) {
+ remove_full(s, n, page);
+ add_partial(n, page, DEACTIVATE_TO_HEAD);
+ }
+ return NULL;
+ }
+
+ WARN_ON(!freelist);
+ return freelist;
+}
+
+
+
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
@@ -1688,7 +1746,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!pfmemalloc_match(page, flags))
continue;
- t = acquire_slab(s, n, page, object == NULL, &objects);
+ if (kmem_cache_debug(s))
+ t = acquire_slab_debug(s, n, page, object == NULL, &objects);
+ else
+ t = acquire_slab(s, n, page, object == NULL, &objects);
+
if (!t)
break;
@@ -2284,7 +2346,17 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
* muck around with it freely without cmpxchg
*/
freelist = page->freelist;
- page->freelist = NULL;
+ page->freelist = kmem_cache_debug(s) ?
+ get_freepointer(s, freelist) : NULL;
+
+ if (kmem_cache_debug(s)) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, page_to_nid(page));
+ spin_lock(&n->list_lock);
+ add_partial(n, page, DEACTIVATE_TO_HEAD);
+ spin_unlock(&n->list_lock);
+ }
stat(s, ALLOC_SLAB);
c->page = page;
@@ -2446,7 +2518,7 @@ new_slab:
!alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
- deactivate_slab(s, page, get_freepointer(s, freelist));
+ /* No need to deactivate, no cpu slab */
c->page = NULL;
c->freelist = NULL;
return freelist;
--
2.5.5
On 03/28/2016 03:53 PM, Laura Abbott wrote:
> The per-cpu slab is designed to be the primary path for allocation in SLUB
> since it assumed allocations will go through the fast path if possible.
> When debugging is enabled, the fast path is disabled and per-cpu
> allocations are not used. The current debugging code path still activates
> the cpu slab for allocations and then immediately deactivates it. This
> is useless work. When a slab is enabled for debugging, skip cpu
> activation.
>
> Signed-off-by: Laura Abbott <[email protected]>
> ---
> This is a follow on to the optimization of the debug paths for poisoning
> With this I get ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
> and no noticable change with slub_debug=- .
zero day robot pointed out this is triggering one of the BUG_ON on bootup.
I'll take a deeper look tomorrow unless the approach is actually worthless.
> ---
> mm/slub.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index 7277413..4507bd8 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1482,8 +1482,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> }
>
> page->freelist = fixup_red_left(s, start);
> - page->inuse = page->objects;
> - page->frozen = 1;
> + page->inuse = kmem_cache_debug(s) ? 1 : page->objects;
> + page->frozen = kmem_cache_debug(s) ? 0 : 1;
>
> out:
> if (gfpflags_allow_blocking(flags))
> @@ -1658,6 +1658,64 @@ static inline void *acquire_slab(struct kmem_cache *s,
> return freelist;
> }
>
> +
> +static inline void *acquire_slab_debug(struct kmem_cache *s,
> + struct kmem_cache_node *n, struct page *page,
> + int mode, int *objects)
> +{
> + void *freelist;
> + unsigned long counters;
> + struct page new;
> + void *next;
> +
> + lockdep_assert_held(&n->list_lock);
> +
> +
> + /*
> + * Zap the freelist and set the frozen bit.
> + * The old freelist is the list of objects for the
> + * per cpu allocation list.
> + */
> + freelist = page->freelist;
> + counters = page->counters;
> +
> + BUG_ON(!freelist);
> +
> + next = get_freepointer_safe(s, freelist);
> +
> + new.counters = counters;
> + *objects = new.objects - new.inuse;
> + if (mode) {
> + new.inuse++;
> + new.freelist = next;
> + } else {
> + BUG();
> + }
> +
> + VM_BUG_ON(new.frozen);
> +
> + if (!new.freelist) {
> + remove_partial(n, page);
> + add_full(s, n, page);
> + }
> +
> + if (!__cmpxchg_double_slab(s, page,
> + freelist, counters,
> + new.freelist, new.counters,
> + "acquire_slab")) {
> + if (!new.freelist) {
> + remove_full(s, n, page);
> + add_partial(n, page, DEACTIVATE_TO_HEAD);
> + }
> + return NULL;
> + }
> +
> + WARN_ON(!freelist);
> + return freelist;
> +}
> +
> +
> +
> static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
> static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
>
> @@ -1688,7 +1746,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> if (!pfmemalloc_match(page, flags))
> continue;
>
> - t = acquire_slab(s, n, page, object == NULL, &objects);
> + if (kmem_cache_debug(s))
> + t = acquire_slab_debug(s, n, page, object == NULL, &objects);
> + else
> + t = acquire_slab(s, n, page, object == NULL, &objects);
> +
> if (!t)
> break;
>
> @@ -2284,7 +2346,17 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
> * muck around with it freely without cmpxchg
> */
> freelist = page->freelist;
> - page->freelist = NULL;
> + page->freelist = kmem_cache_debug(s) ?
> + get_freepointer(s, freelist) : NULL;
> +
> + if (kmem_cache_debug(s)) {
> + struct kmem_cache_node *n;
> +
> + n = get_node(s, page_to_nid(page));
> + spin_lock(&n->list_lock);
> + add_partial(n, page, DEACTIVATE_TO_HEAD);
> + spin_unlock(&n->list_lock);
> + }
>
> stat(s, ALLOC_SLAB);
> c->page = page;
> @@ -2446,7 +2518,7 @@ new_slab:
> !alloc_debug_processing(s, page, freelist, addr))
> goto new_slab; /* Slab failed checks. Next slab needed */
>
> - deactivate_slab(s, page, get_freepointer(s, freelist));
> + /* No need to deactivate, no cpu slab */
> c->page = NULL;
> c->freelist = NULL;
> return freelist;
>
On 03/28/2016 06:52 PM, Laura Abbott wrote:
> On 03/28/2016 03:53 PM, Laura Abbott wrote:
>> The per-cpu slab is designed to be the primary path for allocation in SLUB
>> since it assumed allocations will go through the fast path if possible.
>> When debugging is enabled, the fast path is disabled and per-cpu
>> allocations are not used. The current debugging code path still activates
>> the cpu slab for allocations and then immediately deactivates it. This
>> is useless work. When a slab is enabled for debugging, skip cpu
>> activation.
>>
>> Signed-off-by: Laura Abbott <[email protected]>
>> ---
>> This is a follow on to the optimization of the debug paths for poisoning
>> With this I get ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
>> and no noticable change with slub_debug=- .
>
> zero day robot pointed out this is triggering one of the BUG_ON on bootup.
> I'll take a deeper look tomorrow unless the approach is actually worthless.
>> ---
>> mm/slub.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>> 1 file changed, 77 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index 7277413..4507bd8 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1482,8 +1482,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> }
>>
>> page->freelist = fixup_red_left(s, start);
>> - page->inuse = page->objects;
>> - page->frozen = 1;
>> + page->inuse = kmem_cache_debug(s) ? 1 : page->objects;
>> + page->frozen = kmem_cache_debug(s) ? 0 : 1;
>>
>> out:
>> if (gfpflags_allow_blocking(flags))
>> @@ -1658,6 +1658,64 @@ static inline void *acquire_slab(struct kmem_cache *s,
>> return freelist;
>> }
>>
>> +
>> +static inline void *acquire_slab_debug(struct kmem_cache *s,
>> + struct kmem_cache_node *n, struct page *page,
>> + int mode, int *objects)
>> +{
>> + void *freelist;
>> + unsigned long counters;
>> + struct page new;
>> + void *next;
>> +
>> + lockdep_assert_held(&n->list_lock);
>> +
>> +
>> + /*
>> + * Zap the freelist and set the frozen bit.
>> + * The old freelist is the list of objects for the
>> + * per cpu allocation list.
>> + */
>> + freelist = page->freelist;
>> + counters = page->counters;
>> +
>> + BUG_ON(!freelist);
>> +
>> + next = get_freepointer_safe(s, freelist);
>> +
>> + new.counters = counters;
>> + *objects = new.objects - new.inuse;
>> + if (mode) {
>> + new.inuse++;
>> + new.freelist = next;
>> + } else {
>> + BUG();
>> + }
>> +
>> + VM_BUG_ON(new.frozen);
>> +
>> + if (!new.freelist) {
>> + remove_partial(n, page);
>> + add_full(s, n, page);
>> + }
>> +
>> + if (!__cmpxchg_double_slab(s, page,
>> + freelist, counters,
>> + new.freelist, new.counters,
>> + "acquire_slab")) {
>> + if (!new.freelist) {
>> + remove_full(s, n, page);
>> + add_partial(n, page, DEACTIVATE_TO_HEAD);
>> + }
>> + return NULL;
>> + }
>> +
>> + WARN_ON(!freelist);
>> + return freelist;
>> +}
>> +
>> +
>> +
>> static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
>> static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
>>
>> @@ -1688,7 +1746,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
>> if (!pfmemalloc_match(page, flags))
>> continue;
>>
>> - t = acquire_slab(s, n, page, object == NULL, &objects);
>> + if (kmem_cache_debug(s))
>> + t = acquire_slab_debug(s, n, page, object == NULL, &objects);
>> + else
>> + t = acquire_slab(s, n, page, object == NULL, &objects);
>> +
>> if (!t)
>> break;
>>
>> @@ -2284,7 +2346,17 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
>> * muck around with it freely without cmpxchg
>> */
>> freelist = page->freelist;
>> - page->freelist = NULL;
>> + page->freelist = kmem_cache_debug(s) ?
>> + get_freepointer(s, freelist) : NULL;
>> +
>> + if (kmem_cache_debug(s)) {
>> + struct kmem_cache_node *n;
>> +
>> + n = get_node(s, page_to_nid(page));
>> + spin_lock(&n->list_lock);
>> + add_partial(n, page, DEACTIVATE_TO_HEAD);
>> + spin_unlock(&n->list_lock);
>> + }
This needs to account for slabs that are full after one object; otherwise it
bugs out on the partial list.
>>
>> stat(s, ALLOC_SLAB);
>> c->page = page;
>> @@ -2446,7 +2518,7 @@ new_slab:
>> !alloc_debug_processing(s, page, freelist, addr))
>> goto new_slab; /* Slab failed checks. Next slab needed */
>>
>> - deactivate_slab(s, page, get_freepointer(s, freelist));
>> + /* No need to deactivate, no cpu slab */
>> c->page = NULL;
>> c->freelist = NULL;
>> return freelist;
>>
>
On Mon, Mar 28, 2016 at 03:53:01PM -0700, Laura Abbott wrote:
> The per-cpu slab is designed to be the primary path for allocation in SLUB
> since it assumed allocations will go through the fast path if possible.
> When debugging is enabled, the fast path is disabled and per-cpu
> allocations are not used. The current debugging code path still activates
> the cpu slab for allocations and then immediately deactivates it. This
> is useless work. When a slab is enabled for debugging, skip cpu
> activation.
>
> Signed-off-by: Laura Abbott <[email protected]>
> ---
> This is a follow on to the optimization of the debug paths for poisoning
> With this I get ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
> and no noticable change with slub_debug=- .
I'd like to know the performance difference between slub_debug=P and
slub_debug=- with this change.
Although this patch increases hackbench performance, I'm not sure it's
sufficient for a production system. Concurrent slab allocation requests
will contend on the node lock in every allocation attempt. So, there would be
other use-cases where the performance drop due to slub_debug=P cannot be
accepted even if it is a security feature.
How about allowing cpu partial list for debug cases?
It will not hurt fast path and will make less contention on the node
lock.
Thanks.
On 03/31/2016 07:35 PM, Joonsoo Kim wrote:
> On Mon, Mar 28, 2016 at 03:53:01PM -0700, Laura Abbott wrote:
>> The per-cpu slab is designed to be the primary path for allocation in SLUB
>> since it assumed allocations will go through the fast path if possible.
>> When debugging is enabled, the fast path is disabled and per-cpu
>> allocations are not used. The current debugging code path still activates
>> the cpu slab for allocations and then immediately deactivates it. This
>> is useless work. When a slab is enabled for debugging, skip cpu
>> activation.
>>
>> Signed-off-by: Laura Abbott <[email protected]>
>> ---
>> This is a follow on to the optimization of the debug paths for poisoning
>> With this I get ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
>> and no noticable change with slub_debug=- .
>
> I'd like to know the performance difference between slub_debug=P and
> slub_debug=- with this change.
>
with the hackbench benchmark
slub_debug=- 6.834
slub_debug=P 8.059
so ~1.2 second difference.
> Although this patch increases hackbench performance, I'm not sure it's
> sufficient for the production system. Concurrent slab allocation request
> will contend the node lock in every allocation attempt. So, there would be
> other ues-cases that performance drop due to slub_debug=P cannot be
> accepted even if it is security feature.
>
Hmmm, I hadn't considered that :-/
> How about allowing cpu partial list for debug cases?
> It will not hurt fast path and will make less contention on the node
> lock.
>
That helps more than this patch! It brings slub_debug=P down to 7.535
with the same relaxing of restrictions of CMPXCHG (allow the partials
with poison or redzoning, restrict otherwise).
It still seems unfortunate that deactivate_slab takes up so much time
of __slab_alloc. I'll give some more thought to trying to skip
the CPU slab activation with the cpu partial list.
> Thanks.
>
Thanks,
Laura
On 04/01/2016 03:15 PM, Laura Abbott wrote:
> On 03/31/2016 07:35 PM, Joonsoo Kim wrote:
>> On Mon, Mar 28, 2016 at 03:53:01PM -0700, Laura Abbott wrote:
>>> The per-cpu slab is designed to be the primary path for allocation in SLUB
>>> since it assumed allocations will go through the fast path if possible.
>>> When debugging is enabled, the fast path is disabled and per-cpu
>>> allocations are not used. The current debugging code path still activates
>>> the cpu slab for allocations and then immediately deactivates it. This
>>> is useless work. When a slab is enabled for debugging, skip cpu
>>> activation.
>>>
>>> Signed-off-by: Laura Abbott <[email protected]>
>>> ---
>>> This is a follow on to the optimization of the debug paths for poisoning
>>> With this I get ~2 second drop on hackbench -g 20 -l 1000 with slub_debug=P
>>> and no noticable change with slub_debug=- .
>>
>> I'd like to know the performance difference between slub_debug=P and
>> slub_debug=- with this change.
>>
>
> with the hackbench benchmark
>
> slub_debug=- 6.834
>
> slub_debug=P 8.059
>
>
> so ~1.2 second difference.
>
>> Although this patch increases hackbench performance, I'm not sure it's
>> sufficient for the production system. Concurrent slab allocation request
>> will contend the node lock in every allocation attempt. So, there would be
>> other ues-cases that performance drop due to slub_debug=P cannot be
>> accepted even if it is security feature.
>>
>
> Hmmm, I hadn't considered that :-/
>
>> How about allowing cpu partial list for debug cases?
>> It will not hurt fast path and will make less contention on the node
>> lock.
>>
>
> That helps more than this patch! It brings slub_debug=P down to 7.535
> with the same relaxing of restrictions of CMPXCHG (allow the partials
> with poison or redzoning, restrict otherwise).
>
> It still seems unfortunate that deactive_slab takes up so much time
> of __slab_alloc. I'll give some more thought about trying to skip
> the CPU slab activation with the cpu partial list.
>
I realized I was too eager about the number there. That number includes
using the slow path since the CPU partial list activates the fast path.
I'll need to think about how to use the CPU partial list and still
force debugging on the slow path.
Thanks,
Laura