2020-02-21 21:17:49

by Luigi Rizzo

Subject: [PATCH v4] kretprobe: percpu support

kretprobe uses a list protected by a single lock to allocate a
kretprobe_instance in pre_handler_kretprobe(). This works poorly with
concurrent calls.

This patch offers a simplified fix: the percpu_instance flag indicates
that kretprobe_instances are per CPU, which makes allocation lock-free.

As part of this patch, we factor out the code to allocate instances in
get_pcpu_rp_instance() and get_rp_instance().

At the moment we only allow one pending kretprobe per CPU. This can be
extended to a small constant number of entries, but finding a free entry
would either bring back the lock, or require scanning an array, which can
be expensive (especially if callers block and migrate).
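
For illustration, a user of this mode would set the new flag when defining
the kretprobe, roughly as follows (sketch only: the probed symbol is a
placeholder and error handling is omitted):

#include <linux/kprobes.h>
#include <linux/module.h>

static int my_entry_handler(struct kretprobe_instance *ri,
			    struct pt_regs *regs)
{
	/* runs at function entry; per-instance data lives in ri->data */
	return 0;
}

static int my_ret_handler(struct kretprobe_instance *ri,
			  struct pt_regs *regs)
{
	/* runs at function return */
	return 0;
}

static struct kretprobe my_rp = {
	.kp.symbol_name	 = "kernel_clone",	/* placeholder target */
	.entry_handler	 = my_entry_handler,
	.handler	 = my_ret_handler,
	.data_size	 = sizeof(long),
	.percpu_instance = true,	/* new flag: one instance per CPU */
};

static int __init my_probe_init(void)
{
	/* registration allocates one percpu instance instead of a list */
	return register_kretprobe(&my_rp);
}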

Signed-off-by: Luigi Rizzo <[email protected]>
---
include/linux/kprobes.h | 8 +++-
kernel/kprobes.c | 102 +++++++++++++++++++++++++++++++---------
kernel/test_kprobes.c | 20 ++++++--
3 files changed, 101 insertions(+), 29 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 04bdaf01112cb..c5894f4c5c26c 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -142,6 +142,8 @@ static inline int kprobe_ftrace(struct kprobe *p)
* can be active concurrently.
* nmissed - tracks the number of times the probed function's return was
* ignored, due to maxactive being too low.
+ * percpu_instance - if set, uses one instance per cpu instead of allocating
+ * from the list protected by lock.
*
*/
struct kretprobe {
@@ -151,8 +153,12 @@ struct kretprobe {
int maxactive;
int nmissed;
size_t data_size;
- struct hlist_head free_instances;
+ union {
+ struct kretprobe_instance __percpu *pcpu;
+ struct hlist_head free_instances;
+ };
raw_spinlock_t lock;
+ bool percpu_instance;
};

struct kretprobe_instance {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2625c241ac00f..46018a4a76178 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri,
hlist_del(&ri->hlist);
INIT_HLIST_NODE(&ri->hlist);
if (likely(rp)) {
+ if (rp->percpu_instance) {
+ ri->rp = NULL;
+ return;
+ }
raw_spin_lock(&rp->lock);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock(&rp->lock);
@@ -1274,6 +1278,11 @@ static inline void free_rp_inst(struct kretprobe *rp)
struct kretprobe_instance *ri;
struct hlist_node *next;

+ if (rp->percpu_instance) {
+ free_percpu(rp->pcpu);
+ return;
+ }
+
hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -1851,6 +1860,46 @@ unsigned long __weak arch_deref_entry_point(void *entry)
}

#ifdef CONFIG_KRETPROBES
+struct kretprobe_instance *get_pcpu_rp_instance(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ri = this_cpu_ptr(rp->pcpu);
+ if (!ri || ri->rp) { /* already in use */
+ local_irq_restore(flags);
+ rp->nmissed++;
+ return NULL;
+ }
+ INIT_HLIST_NODE(&ri->hlist);
+ ri->rp = rp;
+ ri->task = current;
+ local_irq_restore(flags);
+ return ri;
+}
+
+struct kretprobe_instance *get_rp_instance(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rp->lock, flags);
+ if (hlist_empty(&rp->free_instances)) {
+ rp->nmissed++;
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+ return NULL;
+ }
+ ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance,
+ hlist);
+ hlist_del(&ri->hlist);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
+
+ ri->rp = rp;
+ ri->task = current;
+ return ri;
+}
+
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -1873,35 +1922,32 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}

/* TODO: consider to only swap the RA after the last pre_handler fired */
- hash = hash_ptr(current, KPROBE_HASH_BITS);
- raw_spin_lock_irqsave(&rp->lock, flags);
- if (!hlist_empty(&rp->free_instances)) {
- ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, hlist);
- hlist_del(&ri->hlist);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
-
- ri->rp = rp;
- ri->task = current;
-
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ if (rp->percpu_instance) {
+ ri = get_pcpu_rp_instance(rp);
+ } else {
+ ri = get_rp_instance(rp);
+ }
+ if (!ri)
+ return 0;
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ if (rp->percpu_instance) {
+ ri->rp = NULL;
+ } else {
raw_spin_lock_irqsave(&rp->lock, flags);
hlist_add_head(&ri->hlist, &rp->free_instances);
raw_spin_unlock_irqrestore(&rp->lock, flags);
- return 0;
}
+ return 0;
+ }
+ arch_prepare_kretprobe(ri, regs);

- arch_prepare_kretprobe(ri, regs);
+ /* XXX(hch): why is there no hlist_move_head? */
+ INIT_HLIST_NODE(&ri->hlist);
+ hash = hash_ptr(current, KPROBE_HASH_BITS);
+ kretprobe_table_lock(hash, &flags);
+ hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+ kretprobe_table_unlock(hash, &flags);

- /* XXX(hch): why is there no hlist_move_head? */
- INIT_HLIST_NODE(&ri->hlist);
- kretprobe_table_lock(hash, &flags);
- hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
- kretprobe_table_unlock(hash, &flags);
- } else {
- rp->nmissed++;
- raw_spin_unlock_irqrestore(&rp->lock, flags);
- }
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
@@ -1950,6 +1996,15 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;

+ if (rp->percpu_instance) {
+ rp->pcpu = __alloc_percpu(sizeof(*rp->pcpu) + rp->data_size,
+ __alignof__(*rp->pcpu));
+ if (rp->pcpu)
+ goto finalize;
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
@@ -1971,6 +2026,7 @@ int register_kretprobe(struct kretprobe *rp)
hlist_add_head(&inst->hlist, &rp->free_instances);
}

+finalize:
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 76c997fdbc9da..4ebe48301d743 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -236,31 +236,36 @@ static struct kretprobe rp2 = {
.kp.symbol_name = "kprobe_target2"
};

-static int test_kretprobes(void)
+static int test_kretprobes(bool percpu_instance)
{
int ret;
struct kretprobe *rps[2] = {&rp, &rp2};
+ const char *mode = percpu_instance ? "percpu " : "normal";

/* addr and flags should be cleard for reusing kprobe. */
rp.kp.addr = NULL;
rp.kp.flags = 0;
+ rp.percpu_instance = percpu_instance;
+ rp2.kp.addr = NULL;
+ rp2.kp.flags = 0;
+ rp2.percpu_instance = percpu_instance;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- pr_err("register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe mode %s returned %d\n", mode, ret);
return ret;
}

krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler not called\n");
+ pr_err("kretprobe handler mode %s not called\n", mode);
handler_errors++;
}

krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- pr_err("kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 mode %s not called\n", mode);
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -297,7 +302,12 @@ int init_test_probes(void)
errors++;

num_tests++;
- ret = test_kretprobes();
+ ret = test_kretprobes(false);
+ if (ret < 0)
+ errors++;
+
+ num_tests++;
+ ret = test_kretprobes(true);
if (ret < 0)
errors++;
#endif /* CONFIG_KRETPROBES */
--
2.25.0.265.gbab2e86ba0-goog


2020-03-05 06:43:21

by Masami Hiramatsu

Subject: Re: [PATCH v4] kretprobe: percpu support

Hi Luigi,

On Fri, 21 Feb 2020 13:16:57 -0800
Luigi Rizzo <[email protected]> wrote:

> kretprobe uses a list protected by a single lock to allocate a
> kretprobe_instance in pre_handler_kretprobe(). This works poorly with
> concurrent calls.
>
> This patch offers a simplified fix: the percpu_instance flag indicates
> that kretprobe_instances are per CPU, which makes allocation lock-free.

My concern is that your new percpu_instance flag silently hides the
maxactive parameter. At the very least we should return an error if maxactive
is a positive value while percpu_instance is true. Or, define
KRETPROBE_PERCPU_MAXACTIVE (< 0) and use it instead of a new bool flag.
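
E.g. something like this near the top of register_kretprobe() (just a
sketch of the check, not tested):

	if (rp->percpu_instance && rp->maxactive > 0)
		return -EINVAL;	/* the two knobs conflict */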

> As part of this patch, we factor out the code to allocate instances in
> get_pcpu_rp_instance() and get_rp_instance().
>
> At the moment we only allow one pending kretprobe per CPU. This can be
> extended to a small constant number of entries, but finding a free entry
> would either bring back the lock, or require scanning an array, which can
> be expensive (especially if callers block and migrate).

I think if you disable irqs while scanning the array (which should be
a small array), you don't need to be afraid of such races (maybe we need
a pair of memory barriers).
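
Something like this, for example (only a sketch: the slot count and the
pcpu_slots field are made up here, and data_size handling is omitted):

	#define KRETPROBE_PCPU_SLOTS	4	/* small, arbitrary */

	static struct kretprobe_instance *get_pcpu_rp_instance(struct kretprobe *rp)
	{
		struct kretprobe_instance *slots, *ri = NULL;
		unsigned long flags;
		int i;

		local_irq_save(flags);
		slots = this_cpu_ptr(rp->pcpu_slots);	/* hypothetical per-CPU array */
		for (i = 0; i < KRETPROBE_PCPU_SLOTS; i++) {
			if (!slots[i].rp) {	/* free slot */
				ri = &slots[i];
				INIT_HLIST_NODE(&ri->hlist);
				ri->rp = rp;	/* may need barriers to pair with the remote release */
				ri->task = current;
				break;
			}
		}
		if (!ri)
			rp->nmissed++;
		local_irq_restore(flags);
		return ri;
	}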

Anyway, my concern is about introducing a new knob which conflicts with
another knob. The implementation looks good to me.

Thank you,


--
Masami Hiramatsu <[email protected]>

2020-03-05 14:47:18

by Masami Hiramatsu

Subject: Re: [PATCH v4] kretprobe: percpu support

On Thu, 5 Mar 2020 12:05:42 +0100
Luigi Rizzo <[email protected]> wrote:

> > > As part of this patch, we factor out the code to allocate instances in
> > > get_pcpu_rp_instance() and get_rp_instance().
> > >
> > > At the moment we only allow one pending kretprobe per CPU. This can be
> > > extended to a small constant number of entries, but finding a free entry
> > > would either bring back the lock, or require scanning an array, which can
> > > be expensive (especially if callers block and migrate).
> >
> > I think if you disable irqs while scanning the array (which should be
> > a small array), you don't need to be afraid of such races (maybe we need
> > a pair of memory barriers).
> >
>
> To be clear, I was not concerned about races (disabling irqs solves that;
> worst case we'll miss an entry being freed by another core while we scan).
> The cost I worried about is when we have many busy entries which may well
> be out of the local cache of cpu X, e.g. because the thread that grabbed
> the entry migrated to another cpu Y and is updating the record there. This
> can be partially mitigated by putting the user block in a different cache
> line, so the cache conflict happens only once, on release.
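
I guess that mitigation would look roughly like this (sketch only, with
the existing bookkeeping fields abbreviated):

	struct kretprobe_instance {
		/* hlist, rp, task, ... as today ... */
		char data[] ____cacheline_aligned;	/* user block starts on its own cache line */
	};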

But how much is that cost? Do you have an actual probe point and workload
for the use cases you are concerned about? I think we can use perf to measure
the actual cost.

I would like to know the actual benefit of this change.
This is important because, in the future, if someone has another idea to
address your concern, how can I judge it?

Thank you,

--
Masami Hiramatsu <[email protected]>