2020-02-18 00:57:28

by Luigi Rizzo

Subject: [PATCH v3] kretprobe: percpu support

kretprobe uses a list protected by a single lock to allocate a
kretprobe_instance in pre_handler_kretprobe(). This works poorly with
concurrent calls.

This patch offers a simplified fix: the percpu_instance flag indicates
that we allocate one instance per CPU, and the allocation is contention
free, but we only allow one pending entry per CPU (this could be
extended to a small constant number without much trouble).
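
For example, a probe would opt in with something like this (usage sketch
only; the handler and the probed symbol are arbitrary):

        static int my_ret_handler(struct kretprobe_instance *ri,
                                  struct pt_regs *regs)
        {
                return 0;       /* e.g. look at regs_return_value(regs) here */
        }

        static struct kretprobe my_rp = {
                .handler = my_ret_handler,
                .kp.symbol_name = "_do_fork",   /* arbitrary example target */
                .percpu_instance = 1,           /* one lock-free instance per CPU */
        };

        /* ... */
        register_kretprobe(&my_rp);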

Signed-off-by: Luigi Rizzo <[email protected]>
---
include/linux/kprobes.h | 10 ++++++++--
kernel/kprobes.c | 41 +++++++++++++++++++++++++++++++++++++++++
kernel/test_kprobes.c | 18 +++++++++++++-----
3 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 04bdaf01112cb..5c549d6a599b9 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -142,7 +142,8 @@ static inline int kprobe_ftrace(struct kprobe *p)
* can be active concurrently.
* nmissed - tracks the number of times the probed function's return was
* ignored, due to maxactive being too low.
- *
+ * percpu_instance - if set, uses one instance per cpu instead of allocating
+ * from the list protected by lock.
*/
struct kretprobe {
struct kprobe kp;
@@ -151,8 +152,13 @@ struct kretprobe {
int maxactive;
int nmissed;
size_t data_size;
- struct hlist_head free_instances;
+ union {
+ struct kretprobe_instance __percpu *pcpu;
+ struct hlist_head free_instances;
+ };
raw_spinlock_t lock;
+ u32 percpu_instance:1;
+ u32 unused:31;
};

struct kretprobe_instance {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00f..12ca682083ffc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri,
hlist_del(&ri->hlist);
INIT_HLIST_NODE(&ri->hlist);
if (likely(rp)) {
+ if (rp->percpu_instance) {
+ ri->rp = NULL;
+ return;
+ }
raw_spin_lock(&rp->lock);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock(&rp->lock);
@@ -1274,6 +1278,11 @@ static inline void free_rp_inst(struct kretprobe *rp)
struct kretprobe_instance *ri;
struct hlist_node *next;

+ if (rp->percpu_instance) {
+ free_percpu(rp->pcpu);
+ return;
+ }
+
hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -1874,6 +1883,27 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)

/* TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
+ if (rp->percpu_instance) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ri = this_cpu_ptr(rp->pcpu);
+ if (!ri || ri->rp) { /* already in use */
+ local_irq_restore(flags);
+ rp->nmissed++;
+ return 0;
+ }
+ INIT_HLIST_NODE(&ri->hlist);
+ ri->rp = rp;
+ ri->task = current;
+ local_irq_restore(flags);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ ri->rp = NULL; /* failed */
+ return 0;
+ }
+ goto good;
+ }
raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
ri = hlist_entry(rp->free_instances.first,
@@ -1891,6 +1921,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
return 0;
}

+good:
arch_prepare_kretprobe(ri, regs);

/* XXX(hch): why is there no hlist_move_head? */
@@ -1950,6 +1981,15 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;

+ if (rp->percpu_instance) {
+ rp->pcpu = __alloc_percpu(sizeof(*rp->pcpu) + rp->data_size,
+ __alignof__(*rp->pcpu));
+ if (rp->pcpu)
+ goto finalize;
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
@@ -1971,6 +2011,7 @@ int register_kretprobe(struct kretprobe *rp)
hlist_add_head(&inst->hlist, &rp->free_instances);
}

+finalize:
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 76c997fdbc9da..22b734fb8bcb2 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -236,31 +236,36 @@ static struct kretprobe rp2 = {
.kp.symbol_name = "kprobe_target2"
};

-static int test_kretprobes(void)
+static int test_kretprobes(bool percpu_instance)
{
int ret;
struct kretprobe *rps[2] = {&rp, &rp2};
+ const char *mode = percpu_instance ? "percpu " : "normal";

/* addr and flags should be cleard for reusing kprobe. */
rp.kp.addr = NULL;
rp.kp.flags = 0;
+ rp.percpu_instance = percpu_instance;
+ rp2.kp.addr = NULL;
+ rp2.kp.flags = 0;
+ rp2.percpu_instance = percpu_instance;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe mode %s returned %d\n", mode, ret);
return ret;
}

krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
+ pr_err("kretprobe handler mode %s not called\n", mode);
handler_errors++;
}

krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 mode %s not called\n", mode);
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -297,7 +302,10 @@ int init_test_probes(void)
errors++;

num_tests++;
- ret = test_kretprobes();
+ ret = test_kretprobes(false);
+ if (ret < 0)
+ errors++;
+ ret = test_kretprobes(true);
if (ret < 0)
errors++;
#endif /* CONFIG_KRETPROBES */
--
2.25.0.265.gbab2e86ba0-goog


2020-02-18 07:55:58

by Masami Hiramatsu

Subject: Re: [PATCH v3] kretprobe: percpu support

Hi Luigi,

On Mon, 17 Feb 2020 16:56:59 -0800
Luigi Rizzo <[email protected]> wrote:

> kretprobe uses a list protected by a single lock to allocate a
> kretprobe_instance in pre_handler_kretprobe(). This works poorly with
> concurrent calls.

Yes, there are several potential performance issues and instance
recycling is one of them. However, I think this spinlock is not so much
racy as noisy (especially on a many-core machine), right?

The racy lock is kretprobe_hash_lock(); I would like to replace it
with ftrace's per-task shadow stack. But that will be available
only if CONFIG_FUNCTION_GRAPH_TRACER=y (and only if the instance has
no payload of its own).

> This patch offers a simplified fix: the percpu_instance flag indicates
> that we allocate one instance per CPU, and the allocation is contention
> free, but we only allow one pending entry per CPU (this could be
> extended to a small constant number without much trouble).

OK, the percpu instance idea looks good to me, and I think it should be
the default option. Unless the user specifies the number of instances, it
should choose the percpu instance by default.
Moreover, this makes things a bit complicated; can you add a per-cpu
instance array? If we have that, we can remove the old rp instance
recycling code.

>
> Signed-off-by: Luigi Rizzo <[email protected]>
> ---
> include/linux/kprobes.h | 10 ++++++++--
> kernel/kprobes.c | 41 +++++++++++++++++++++++++++++++++++++++++
> kernel/test_kprobes.c | 18 +++++++++++++-----
> 3 files changed, 62 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> index 04bdaf01112cb..5c549d6a599b9 100644
> --- a/include/linux/kprobes.h
> +++ b/include/linux/kprobes.h
> @@ -142,7 +142,8 @@ static inline int kprobe_ftrace(struct kprobe *p)
> * can be active concurrently.
> * nmissed - tracks the number of times the probed function's return was
> * ignored, due to maxactive being too low.
> - *
> + * percpu_instance - if set, uses one instance per cpu instead of allocating
> + * from the list protected by lock.
> */
> struct kretprobe {
> struct kprobe kp;
> @@ -151,8 +152,13 @@ struct kretprobe {
> int maxactive;
> int nmissed;
> size_t data_size;
> - struct hlist_head free_instances;
> + union {
> + struct kretprobe_instance __percpu *pcpu;
> + struct hlist_head free_instances;
> + };
> raw_spinlock_t lock;
> + u32 percpu_instance:1;
> + u32 unused:31;

Please use a bool for the flag.

> };
>
> struct kretprobe_instance {
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 2625c241ac00f..12ca682083ffc 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -1184,6 +1184,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri,
> hlist_del(&ri->hlist);
> INIT_HLIST_NODE(&ri->hlist);
> if (likely(rp)) {
> + if (rp->percpu_instance) {
> + ri->rp = NULL;
> + return;
> + }
> raw_spin_lock(&rp->lock);
> hlist_add_head(&ri->hlist, &rp->free_instances);
> raw_spin_unlock(&rp->lock);
> @@ -1274,6 +1278,11 @@ static inline void free_rp_inst(struct kretprobe *rp)
> struct kretprobe_instance *ri;
> struct hlist_node *next;
>
> + if (rp->percpu_instance) {
> + free_percpu(rp->pcpu);
> + return;
> + }
> +
> hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
> hlist_del(&ri->hlist);
> kfree(ri);
> @@ -1874,6 +1883,27 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
>
> /* TODO: consider to only swap the RA after the last pre_handler fired */
> hash = hash_ptr(current, KPROBE_HASH_BITS);

Hmm, the position of this hash calculation is not optimal either.
It should be done right before kretprobe_table_lock().

> + if (rp->percpu_instance) {
> + unsigned long flags;

We already have flags, right?

> +
> + local_irq_save(flags);
> + ri = this_cpu_ptr(rp->pcpu);
> + if (!ri || ri->rp) { /* already in use */
> + local_irq_restore(flags);
> + rp->nmissed++;
> + return 0;
> + }
> + INIT_HLIST_NODE(&ri->hlist);
> + ri->rp = rp;
> + ri->task = current;
> + local_irq_restore(flags);
> +
> + if (rp->entry_handler && rp->entry_handler(ri, regs)) {
> + ri->rp = NULL; /* failed */
> + return 0;
> + }
> + goto good;

Can't you make a new helper function for finding a free instance?
(note: don't forget to make it NOKPROBE)
What I would like to see is:

        ri = get_free_rp_instance(rp); /* if !ri, increment rp->nmissed */
        if (ri) {
                arch_prepare_kretprobe(ri, regs);
                ...
        }
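
For example, a rough (untested) sketch of such a helper, merging your
percpu path with the existing free-list path:

static struct kretprobe_instance *get_free_rp_instance(struct kretprobe *rp)
{
        struct kretprobe_instance *ri = NULL;
        unsigned long flags;

        if (rp->percpu_instance) {
                local_irq_save(flags);
                ri = this_cpu_ptr(rp->pcpu);
                if (ri->rp) {           /* the only percpu slot is busy */
                        local_irq_restore(flags);
                        return NULL;
                }
                ri->rp = rp;            /* claim the slot */
                local_irq_restore(flags);
        } else {
                raw_spin_lock_irqsave(&rp->lock, flags);
                if (hlist_empty(&rp->free_instances)) {
                        raw_spin_unlock_irqrestore(&rp->lock, flags);
                        return NULL;
                }
                ri = hlist_entry(rp->free_instances.first,
                                 struct kretprobe_instance, hlist);
                hlist_del(&ri->hlist);
                ri->rp = rp;
                raw_spin_unlock_irqrestore(&rp->lock, flags);
        }
        INIT_HLIST_NODE(&ri->hlist);
        ri->task = current;
        return ri;
}
NOKPROBE_SYMBOL(get_free_rp_instance);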

> + }
> raw_spin_lock_irqsave(&rp->lock, flags);
> if (!hlist_empty(&rp->free_instances)) {
> ri = hlist_entry(rp->free_instances.first,
> @@ -1891,6 +1921,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
> return 0;
> }
>
> +good:
> arch_prepare_kretprobe(ri, regs);
>
> /* XXX(hch): why is there no hlist_move_head? */
> @@ -1950,6 +1981,15 @@ int register_kretprobe(struct kretprobe *rp)
> rp->kp.post_handler = NULL;
> rp->kp.fault_handler = NULL;
>
> + if (rp->percpu_instance) {
> + rp->pcpu = __alloc_percpu(sizeof(*rp->pcpu) + rp->data_size,
> + __alignof__(*rp->pcpu));
> + if (rp->pcpu)
> + goto finalize;
> + free_rp_inst(rp);
> + return -ENOMEM;
> + }
> +
> /* Pre-allocate memory for max kretprobe instances */
> if (rp->maxactive <= 0) {

The new code above should go here, and should also set rp->percpu_instance = true.
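
i.e. something like this (rough sketch):

        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0) {
                /* no maxactive given: default to one percpu instance */
                rp->percpu_instance = true;
                rp->pcpu = __alloc_percpu(sizeof(*rp->pcpu) + rp->data_size,
                                          __alignof__(*rp->pcpu));
                if (!rp->pcpu)
                        return -ENOMEM;
                goto finalize;
        }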

Thank you,

--
Masami Hiramatsu <[email protected]>

2020-02-18 09:40:13

by Luigi Rizzo

Subject: Re: [PATCH v3] kretprobe: percpu support

On Mon, Feb 17, 2020 at 11:55 PM Masami Hiramatsu <[email protected]> wrote:
>
> Hi Luigi,
>
> On Mon, 17 Feb 2020 16:56:59 -0800
> Luigi Rizzo <[email protected]> wrote:
>
> > kretprobe uses a list protected by a single lock to allocate a
> > kretprobe_instance in pre_handler_kretprobe(). This works poorly with
> > concurrent calls.
>
> Yes, there are several potential performance issues and instance
> recycling is one of them. However, I think this spinlock is not so much
> racy as noisy (especially on a many-core machine), right?

correct, it is especially painful on 2+ sockets and many-core systems
when attaching kretprobes on otherwise uncontended paths.

>
> The racy lock is kretprobe_hash_lock(); I would like to replace it
> with ftrace's per-task shadow stack. But that will be available
> only if CONFIG_FUNCTION_GRAPH_TRACER=y (and only if the instance has
> no payload of its own).
>
> > This patch offers a simplified fix: the percpu_instance flag indicates
> > that we allocate one instance per CPU, and the allocation is contention
> > free, but we only allow one pending entry per CPU (this could be
> > extended to a small constant number without much trouble).
>
> OK, the percpu instance idea looks good to me, and I think it should be
> the default option. Unless the user specifies the number of instances, it
> should choose the percpu instance by default.

That was my initial implementation, which would not even need the
percpu_instance
flag in struct kretprobe. However, I felt that changing the default
would have subtle
side effects (e.g., only one outstanding call per CPU) so I thought it
would be better
to leave the default unchanged and make the flag explicit.

> Moreover, this makes things a bit complicated; can you add a per-cpu
> instance array? If we have that, we can remove the old rp instance
> recycling code.

Can you clarify what you mean by "per-cpu instance array" ?
Do you mean allowing multiple outstanding entries per cpu?

I will address your code comments in an updated patch.

thanks
luigi

2020-02-18 11:51:52

by Masami Hiramatsu

Subject: Re: [PATCH v3] kretprobe: percpu support

On Tue, 18 Feb 2020 01:39:40 -0800
Luigi Rizzo <[email protected]> wrote:

> On Mon, Feb 17, 2020 at 11:55 PM Masami Hiramatsu <[email protected]> wrote:
> >
> > Hi Luigi,
> >
> > On Mon, 17 Feb 2020 16:56:59 -0800
> > Luigi Rizzo <[email protected]> wrote:
> >
> > > kretprobe uses a list protected by a single lock to allocate a
> > > kretprobe_instance in pre_handler_kretprobe(). This works poorly with
> > > concurrent calls.
> >
> > Yes, there are several potential performance issues and instance
> > recycling is one of them. However, I think this spinlock is not so much
> > racy as noisy (especially on a many-core machine), right?
>
> correct, it is especially painful on 2+ sockets and many-core systems
> when attaching kretprobes on otherwise uncontended paths.
>
> >
> > The racy lock is kretprobe_hash_lock(); I would like to replace it
> > with ftrace's per-task shadow stack. But that will be available
> > only if CONFIG_FUNCTION_GRAPH_TRACER=y (and only if the instance has
> > no payload of its own).
> >
> > > This patch offers a simplified fix: the percpu_instance flag indicates
> > > that we allocate one instance per CPU, and the allocation is contention
> > > free, but we only allow one pending entry per CPU (this could be
> > > extended to a small constant number without much trouble).
> >
> > OK, the percpu instance idea looks good to me, and I think it should be
> > the default option. Unless the user specifies the number of instances, it
> > should choose the percpu instance by default.
>
> That was my initial implementation, which would not even need the
> percpu_instance
> flag in struct kretprobe. However, I felt that changing the default
> would have subtle
> side effects (e.g., only one outstanding call per CPU) so I thought it
> would be better
> to leave the default unchanged and make the flag explicit.
>
> > Moreover, this makes things a bit complicated; can you add a per-cpu
> > instance array? If we have that, we can remove the old rp instance
> > recycling code.
>
> Can you clarify what you mean by "per-cpu instance array" ?
> Do you mean allowing multiple outstanding entries per cpu?

Yes, either allocating it in the percpu area or allocating arrays via the
percpu pointer is OK, e.g.

        instance_size = sizeof(*rp->pcpu) + rp->data_size;
        rp->pcpu = __alloc_percpu(instance_size * array_size,
                                  __alignof__(*rp->pcpu));

And we will then search for a free ri in the percpu array by checking
ri->rp == NULL.
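
The per-cpu lookup could then be a simple scan of that array, e.g.
(rough sketch; array_size and instance_size are assumed to be stored
somewhere reachable from rp, they are not fields in the current patch):

        struct kretprobe_instance *ri = NULL;
        unsigned long flags;
        void *base;
        int i;

        local_irq_save(flags);
        base = this_cpu_ptr(rp->pcpu);
        for (i = 0; i < array_size; i++) {
                ri = base + i * instance_size;
                if (!ri->rp) {          /* free slot found */
                        ri->rp = rp;    /* claim it */
                        break;
                }
                ri = NULL;
        }
        local_irq_restore(flags);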

Thank you,

>
> I will address your code comments in an updated patch.
>
> thanks
> luigi


--
Masami Hiramatsu <[email protected]>

2020-02-18 17:52:43

by Steven Rostedt

Subject: Re: [PATCH v3] kretprobe: percpu support

On Tue, 18 Feb 2020 16:55:29 +0900
Masami Hiramatsu <[email protected]> wrote:

> The racy lock is kretprobe_hash_lock(); I would like to replace it
> with ftrace's per-task shadow stack. But that will be available
> only if CONFIG_FUNCTION_GRAPH_TRACER=y (and only if the instance has
> no payload of its own).

I need to start that work up again (soon) and get that finished. :-/

-- Steve

2020-02-21 21:24:04

by Luigi Rizzo

Subject: Re: [PATCH v3] kretprobe: percpu support

On Tue, Feb 18, 2020 at 3:50 AM Masami Hiramatsu <[email protected]> wrote:
>
> On Tue, 18 Feb 2020 01:39:40 -0800
> Luigi Rizzo <[email protected]> wrote:
>
> > On Mon, Feb 17, 2020 at 11:55 PM Masami Hiramatsu <[email protected]> wrote:
> > >
> > > Hi Luigi,
> > >
> > > On Mon, 17 Feb 2020 16:56:59 -0800
> > > Luigi Rizzo <[email protected]> wrote:
> > >
> > > > kretprobe uses a list protected by a single lock to allocate a
> > > > kretprobe_instance in pre_handler_kretprobe(). This works poorly with
> > > > concurrent calls.
> > >
> > > Yes, there are several potential performance issues and instance
> > > recycling is one of them. However, I think this spinlock is not so much
> > > racy as noisy (especially on a many-core machine), right?
> >
> > correct, it is especially painful on 2+ sockets and many-core systems
> > when attaching kretprobes on otherwise uncontended paths.
> >
> > >
> > > The racy lock is kretprobe_hash_lock(); I would like to replace it
> > > with ftrace's per-task shadow stack. But that will be available
> > > only if CONFIG_FUNCTION_GRAPH_TRACER=y (and only if the instance has
> > > no payload of its own).
> > >
> > > > This patch offers a simplified fix: the percpu_instance flag indicates
> > > > that we allocate one instance per CPU, and the allocation is contention
> > > > free, but we only allow one pending entry per CPU (this could be
> > > > extended to a small constant number without much trouble).
> > >
> > > OK, the percpu instance idea looks good to me, and I think it should be
> > > the default option. Unless the user specifies the number of instances, it
> > > should choose the percpu instance by default.
> >
> > That was my initial implementation, which would not even need the
> > percpu_instance
> > flag in struct kretprobe. However, I felt that changing the default
> > would have subtle
> > side effects (e.g., only one outstanding call per CPU) so I thought it
> > would be better
> > to leave the default unchanged and make the flag explicit.
> >
> > > Moreover, this makes things a bit complicated; can you add a per-cpu
> > > instance array? If we have that, we can remove the old rp instance
> > > recycling code.
> >
> > Can you clarify what you mean by "per-cpu instance array" ?
> > Do you mean allowing multiple outstanding entries per cpu?
>
> Yes, either allocating it in the percpu area or allocating arrays via the
> percpu pointer is OK, e.g.
>
>         instance_size = sizeof(*rp->pcpu) + rp->data_size;
>         rp->pcpu = __alloc_percpu(instance_size * array_size,
>                                   __alignof__(*rp->pcpu));
>
> And we will then search for a free ri in the percpu array by checking
> ri->rp == NULL.

I have posted a v4 patch with the refactoring you suggested, but
still defaulting to non-percpu allocation, and only one entry per CPU.
The former is to avoid potential regressions; the latter is because I worry
that the search in the array may incur several cache misses, especially
if the traced function is allowed to block or the caller can migrate.
(Maybe I am overcautious, but I want to measure that cost first;
once that is clear, perhaps we can move forward with another patch
that defaults to percpu and removes the reclaim code.)

cheers
luigi