2021-08-07 18:55:33

by wuqiang.matt

Subject: [PATCH 0/2] *** kretprobe scalability improvement ***

kretprobe uses a freelist to manage its return instances, but the
freelist, a LIFO queue built on a singly linked list, scales poorly
and thus lowers the throughput of kretprobed routines, especially
under high parallelism. Here is a typical result (Xeon 8260:
2 sockets/48 cores/96 threads):

        1X        2X        4X        6X        8X       12X       16X
  10880312  18121228  23214783  13155457  11190217  10991228   9623992
       24X       32X       48X       64X       96X      128X      192X
   8484455   8376786   6766684   5698349   4113405   4528009   4081401
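
To make the contention point concrete, the snippet below is a minimal,
simplified model of such a freelist (illustrative only; the actual
include/linux/freelist.h additionally packs reference bits into the
head pointer to cope with the ABA problem). Every push and pop has to
win a cmpxchg() on the same head word, so all CPUs fight over a single
cache line and throughput collapses as the thread count grows:

struct freelist_node {
	struct freelist_node	*next;
};

struct freelist_head {
	struct freelist_node	*head;
};

static inline void freelist_push_simple(struct freelist_node *node,
					struct freelist_head *list)
{
	struct freelist_node *head;

	do {
		head = READ_ONCE(list->head);
		node->next = head;	/* link on top of the current head */
	} while (cmpxchg(&list->head, head, node) != head);
}

static inline struct freelist_node *freelist_pop_simple(struct freelist_head *list)
{
	struct freelist_node *head, *next;

	do {
		head = READ_ONCE(list->head);
		if (!head)
			return NULL;	/* pool exhausted */
		next = head->next;
	} while (cmpxchg(&list->head, head, next) != head);

	return head;
}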

This patch set implements a scalable, lock-less and NUMA-aware object
pool and, as a result, improves kretprobe to achieve near-linear
scalability. Tests of kretprobe throughput show a gain of up to 181.5x
over the original freelist. Extreme tests of raw queue throughput show
gains of up to 282.8x. The comparison results are as follows:

                    1X          2X          4X          8X         16X
freelist:    237911411   163596418    33048459    15506757    10640043
objpool:     234799081   294086132   585290693  1164205947  2334923746
                   24X         32X         48X         64X         96X
freelist:      9025299     7965531     6800225     5507639     4284752
objpool:    3508905695  1106760339  1101385147  1221763856  1211654038

The object pool is a percpu-extended version of the original freelist,
with a compact memory footprint and balanced performance across three
test cases: non-blocking retrieval (most kretprobe cases), bulk
retrieval in a row (multi-threaded blockable kretprobe), and heavy
misses (far fewer preallocated objects than required).
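
As a rough mental model of that percpu-extended design (the names and
layout below are illustrative, not the code from this series): each
possible CPU owns a slot of preallocated objects, the fast path touches
only CPU-local cache lines, and other slots are scanned only when the
local slot runs empty. Building on the simplified push/pop sketched
above:

struct objpool_slot {
	struct freelist_head	list;		/* objects owned by this slot */
} ____cacheline_aligned;

struct objpool_head {
	unsigned int		nr_slots;	/* one slot per possible CPU */
	struct objpool_slot	*slots;
};

static struct freelist_node *objpool_pop(struct objpool_head *pool)
{
	unsigned int cpu = raw_smp_processor_id();
	struct freelist_node *node;
	unsigned int i;

	/* fast path: the slot owned by the current CPU */
	node = freelist_pop_simple(&pool->slots[cpu].list);
	if (node)
		return node;

	/* slow path: borrow from the other slots */
	for (i = 1; i < pool->nr_slots; i++) {
		node = freelist_pop_simple(&pool->slots[(cpu + i) % pool->nr_slots].list);
		if (node)
			return node;
	}

	return NULL;		/* all objects are in flight */
}

static void objpool_push(struct freelist_node *node, struct objpool_head *pool)
{
	/* return to the local slot to keep objects close to their consumers */
	freelist_push_simple(node, &pool->slots[raw_smp_processor_id()].list);
}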

wuqiang (2):
scalable lock-less object pool implementation
kretprobe: manage instances with scalable object pool

include/linux/freelist.h | 521 ++++++++++++++++++++++++++++++++++++---
include/linux/kprobes.h | 2 +-
kernel/kprobes.c | 83 ++++---
3 files changed, 536 insertions(+), 70 deletions(-)

--
2.25.1


2021-08-07 18:57:24

by wuqiang.matt

Subject: [PATCH 2/2] kretprobe: manage instances with scalable object pool

Use the new scalable object pool to manage kretprobe instances,
replacing the previous freelist, to improve scalability and throughput
under heavy workloads. The original freelist, a LIFO queue based on a
singly linked list, scales poorly and is not amenable to
parallelization.

Signed-off-by: wuqiang <[email protected]>
---
include/linux/kprobes.h | 2 +-
kernel/kprobes.c | 83 +++++++++++++++++++++--------------------
2 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 1883a4a9f16a..98b37dc01c35 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -148,6 +148,7 @@ static inline int kprobe_ftrace(struct kprobe *p)
*/
struct kretprobe_holder {
struct kretprobe *rp;
+ struct freelist_head fh;
refcount_t ref;
};

@@ -158,7 +159,6 @@ struct kretprobe {
int maxactive;
int nmissed;
size_t data_size;
- struct freelist_head freelist;
struct kretprobe_holder *rph;
};

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 745f08fdd7a6..187997640290 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1217,10 +1217,12 @@ NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;

- if (refcount_dec_and_test(&ri->rph->ref))
- kfree(ri->rph);
- kfree(ri);
+ if (refcount_dec_and_test(&rph->ref)) {
+ freelist_fini(&rph->fh, NULL, NULL);
+ kfree(rph);
+ }
}
NOKPROBE_SYMBOL(free_rp_inst_rcu);

@@ -1229,9 +1231,10 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
struct kretprobe *rp = get_kretprobe(ri);

if (likely(rp)) {
- freelist_add(&ri->freelist, &rp->freelist);
- } else
+ freelist_push(&ri->freelist, &rp->rph->fh);
+ } else {
call_rcu(&ri->rcu, free_rp_inst_rcu);
+ }
}
NOKPROBE_SYMBOL(recycle_rp_inst);

@@ -1286,23 +1289,19 @@ NOKPROBE_SYMBOL(kprobe_flush_task);

static inline void free_rp_inst(struct kretprobe *rp)
{
- struct kretprobe_instance *ri;
- struct freelist_node *node;
- int count = 0;
+ struct kretprobe_holder *rph = rp->rph;
+ struct freelist_node *fn;

- node = rp->freelist.head;
- while (node) {
- ri = container_of(node, struct kretprobe_instance, freelist);
- node = node->next;
-
- kfree(ri);
- count++;
- }
-
- if (refcount_sub_and_test(count, &rp->rph->ref)) {
- kfree(rp->rph);
- rp->rph = NULL;
- }
+ rp->rph = NULL;
+ do {
+ /* must do pop() first since we have one extra ref grabbed */
+ fn = freelist_pop(&rph->fh);
+ if (refcount_dec_and_test(&rph->ref)) {
+ freelist_fini(&rph->fh, NULL, NULL);
+ kfree(rph);
+ break;
+ }
+ } while (fn);
}

/* Add the new probe to ap->list */
@@ -1928,19 +1927,18 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
- struct kretprobe_instance *ri;
struct freelist_node *fn;
+ struct kretprobe_instance *ri;

- fn = freelist_try_get(&rp->freelist);
+ fn = freelist_pop(&rp->rph->fh);
if (!fn) {
- rp->nmissed++;
+ atomic_inc((atomic_t *)&rp->nmissed);
return 0;
}
-
ri = container_of(fn, struct kretprobe_instance, freelist);

if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- freelist_add(&ri->freelist, &rp->freelist);
+ freelist_push(fn, &rp->rph->fh);
return 0;
}

@@ -1986,10 +1984,19 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
return 0;
}

+static int kretprobe_init_inst(void *context, struct freelist_node *fn)
+{
+ struct kretprobe_instance *ri;
+
+ ri = container_of(fn, struct kretprobe_instance, freelist);
+ ri->rph = context;
+
+ return 0;
+}
+
int register_kretprobe(struct kretprobe *rp)
{
int ret;
- struct kretprobe_instance *inst;
int i;
void *addr;

@@ -2024,24 +2031,20 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = num_possible_cpus();
#endif
}
- rp->freelist.head = NULL;
+
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
return -ENOMEM;

- rp->rph->rp = rp;
- for (i = 0; i < rp->maxactive; i++) {
- inst = kzalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
- if (inst == NULL) {
- refcount_set(&rp->rph->ref, i);
- free_rp_inst(rp);
- return -ENOMEM;
- }
- inst->rph = rp->rph;
- freelist_add(&inst->freelist, &rp->freelist);
+ if (freelist_init(&rp->rph->fh, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
}
- refcount_set(&rp->rph->ref, i);
+ refcount_set(&rp->rph->ref, rp->maxactive + 1);
+ rp->rph->rp = rp;

rp->nmissed = 0;
/* Establish function entry probe point */
--
2.25.1
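
Patch 1/2 is not included in this excerpt, so for reference the
interface used above can be inferred from its call sites. The
prototypes below are that inference, not the actual header; return
types and callback shapes that are not visible in the diff are marked
as guesses.

/* per-object init callback: kretprobe_init_inst() above has this shape */
typedef int (*freelist_objinit_cb)(void *context, struct freelist_node *fn);

/*
 * Preallocate nr_objs objects of obj_size bytes each and call objinit
 * on every one; returns 0 on success (the caller above treats non-zero
 * as failure).
 */
int freelist_init(struct freelist_head *fh, int nr_objs, int obj_size,
		  gfp_t gfp, void *context, freelist_objinit_cb objinit);

/* lock-less retrieval and return of a single object */
struct freelist_node *freelist_pop(struct freelist_head *fh);
void freelist_push(struct freelist_node *fn, struct freelist_head *fh);	/* return type is a guess */

/* release all objects and internal memory; both callbacks passed as NULL above */
void freelist_fini(struct freelist_head *fh, void *context, void *release);	/* signature partly a guess */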

2021-08-29 09:30:47

by Masami Hiramatsu

Subject: Re: [PATCH 0/2] *** kretprobe scalability improvement ***

Hi,

On Sun, 8 Aug 2021 02:54:15 +0800
wuqiang <[email protected]> wrote:

> kretprobe uses a freelist to manage its return instances, but the
> freelist, a LIFO queue built on a singly linked list, scales poorly
> and thus lowers the throughput of kretprobed routines, especially
> under high parallelism. Here is a typical result (Xeon 8260:
> 2 sockets/48 cores/96 threads):
>
>         1X        2X        4X        6X        8X       12X       16X
>   10880312  18121228  23214783  13155457  11190217  10991228   9623992
>        24X       32X       48X       64X       96X      128X      192X
>    8484455   8376786   6766684   5698349   4113405   4528009   4081401
>
> This patch set implements a scalable, lock-less and NUMA-aware object
> pool and, as a result, improves kretprobe to achieve near-linear
> scalability. Tests of kretprobe throughput show a gain of up to 181.5x
> over the original freelist. Extreme tests of raw queue throughput show
> gains of up to 282.8x. The comparison results are as follows:
>
>                     1X          2X          4X          8X         16X
> freelist:    237911411   163596418    33048459    15506757    10640043
> objpool:     234799081   294086132   585290693  1164205947  2334923746
>                    24X         32X         48X         64X         96X
> freelist:      9025299     7965531     6800225     5507639     4284752
> objpool:    3508905695  1106760339  1101385147  1221763856  1211654038
>
> The object pool is a percpu-extended version of the original freelist,
> with a compact memory footprint and balanced performance across three
> test cases: non-blocking retrieval (most kretprobe cases), bulk
> retrieval in a row (multi-threaded blockable kretprobe), and heavy
> misses (far fewer preallocated objects than required).

Sorry, I missed this series.
I'm OK with the code, but please combine these 2 patches into 1,
because they are not bisectable as separate patches.

Thank you,

>
> wuqiang (2):
> scalable lock-less object pool implementation
> kretprobe: manage instances with scalable object pool
>
> include/linux/freelist.h | 521 ++++++++++++++++++++++++++++++++++++---
> include/linux/kprobes.h | 2 +-
> kernel/kprobes.c | 83 ++++---
> 3 files changed, 536 insertions(+), 70 deletions(-)
>
> --
> 2.25.1
>


--
Masami Hiramatsu <[email protected]>