2020-03-05 22:14:08

by Joel Fernandes

[permalink] [raw]
Subject: [PATCH linus/master 1/2] rcuperf: Add ability to increase object allocation size

This allows us to increase memory pressure dynamically using a new
rcuperf boot command line parameter called 'kfree_mult'.

Signed-off-by: Joel Fernandes (Google) <[email protected]>

---
kernel/rcu/rcuperf.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index da94b89cd5310..36f0ed75c7cf3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -87,6 +87,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
+torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");

static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
@@ -627,7 +628,7 @@ kfree_perf_thread(void *arg)

do {
for (i = 0; i < kfree_alloc_num; i++) {
- alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+ alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
return -ENOMEM;

@@ -712,6 +713,8 @@ kfree_perf_init(void)
schedule_timeout_uninterruptible(1);
}

+ pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj));
+
kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
if (kfree_reader_tasks == NULL) {
--
2.25.0.265.gbab2e86ba0-goog


2020-03-05 22:14:11

by Joel Fernandes

[permalink] [raw]
Subject: [PATCH linus/master 2/2] rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching

To reduce grace periods and improve kfree() performance, we recently
added batching, dramatically bringing down the number of grace periods
while giving us the ability to use kfree_bulk() for efficient kfree'ing.

However, this has increased the likelihood of an OOM condition under a
heavy kfree_rcu() flood on small-memory systems. This patch introduces a
shrinker which starts grace periods right away if the system is under
memory pressure due to the existence of objects for which a grace period
has not yet started.

With this patch, I do not observe an OOM anymore on a system with 512MB
RAM and 8 CPUs, with the following rcuperf options:

rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2

NOTE:
On systems with no memory pressure, the patch has no effect, as intended.

Cc: [email protected]
Signed-off-by: Joel Fernandes (Google) <[email protected]>

---
kernel/rcu/tree.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 58 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d91c9156fab2e..28ec35e15529d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2723,6 +2723,8 @@ struct kfree_rcu_cpu {
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
+ // Number of objects for which GP not started
+ int count;
};

static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
@@ -2791,6 +2793,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)

krwp->head_free = krcp->head;
krcp->head = NULL;
+ krcp->count = 0;
INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
queue_rcu_work(system_wq, &krwp->rcu_work);
return true;
@@ -2864,6 +2867,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
head->func = func;
head->next = krcp->head;
krcp->head = head;
+ krcp->count++;

// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
@@ -2879,6 +2883,58 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);

+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags, count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_online_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ spin_lock_irqsave(&krcp->lock, flags);
+ count += krcp->count;
+ spin_unlock_irqrestore(&krcp->lock, flags);
+ }
+
+ return count;
+}
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu, freed = 0;
+ unsigned long flags;
+
+ for_each_online_cpu(cpu) {
+ int count;
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count = krcp->count;
+ spin_lock_irqsave(&krcp->lock, flags);
+ if (krcp->monitor_todo)
+ kfree_rcu_drain_unlock(krcp, flags);
+ else
+ spin_unlock_irqrestore(&krcp->lock, flags);
+
+ sc->nr_to_scan -= count;
+ freed += count;
+
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ return freed;
+}
+
+static struct shrinker kfree_rcu_shrinker = {
+ .count_objects = kfree_rcu_shrink_count,
+ .scan_objects = kfree_rcu_shrink_scan,
+ .batch = 0,
+ .seeks = DEFAULT_SEEKS,
+};
+
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
@@ -3774,6 +3830,8 @@ static void __init kfree_rcu_batch_init(void)
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
+ if (register_shrinker(&kfree_rcu_shrinker))
+ pr_err("Failed to register kfree_rcu() shrinker!\n");
}

void __init rcu_init(void)
--
2.25.0.265.gbab2e86ba0-goog

2020-03-05 22:18:47

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH linus/master 2/2] rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching

On Thu, Mar 05, 2020 at 05:13:23PM -0500, Joel Fernandes (Google) wrote:
> To reduce grace periods and improve kfree() performance, we have done
> batching recently dramatically bringing down the number of grace periods
> while giving us the ability to use kfree_bulk() for efficient kfree'ing.
>
> However, this has increased the likelihood of OOM condition under heavy
> kfree_rcu() flood on small memory systems. This patch introduces a
> shrinker which starts grace periods right away if the system is under
> memory pressure due to existence of objects that have still not started
> a grace period.
>
> With this patch, I do not observe an OOM anymore on a system with 512MB
> RAM and 8 CPUs, with the following rcuperf options:
>
> rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
> rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2

Paul,
I may have to rebase this patch on top of Vlad's kfree_bulk() work. But let
us discuss the patch first; I can rebase and repost it once it looks OK to
you. (The kfree_bulk() work should not affect the patch).

thanks,

- Joel


>
> NOTE:
> On systems with no memory pressure, the patch has no effect as intended.
>
> Cc: [email protected]
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
>
> ---
> kernel/rcu/tree.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 58 insertions(+)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index d91c9156fab2e..28ec35e15529d 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2723,6 +2723,8 @@ struct kfree_rcu_cpu {
> struct delayed_work monitor_work;
> bool monitor_todo;
> bool initialized;
> + // Number of objects for which GP not started
> + int count;
> };
>
> static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> @@ -2791,6 +2793,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
>
> krwp->head_free = krcp->head;
> krcp->head = NULL;
> + krcp->count = 0;
> INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
> queue_rcu_work(system_wq, &krwp->rcu_work);
> return true;
> @@ -2864,6 +2867,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> head->func = func;
> head->next = krcp->head;
> krcp->head = head;
> + krcp->count++;
>
> // Set timer to drain after KFREE_DRAIN_JIFFIES.
> if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> @@ -2879,6 +2883,58 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> }
> EXPORT_SYMBOL_GPL(kfree_call_rcu);
>
> +static unsigned long
> +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> +{
> + int cpu;
> + unsigned long flags, count = 0;
> +
> + /* Snapshot count of all CPUs */
> + for_each_online_cpu(cpu) {
> + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +
> + spin_lock_irqsave(&krcp->lock, flags);
> + count += krcp->count;
> + spin_unlock_irqrestore(&krcp->lock, flags);
> + }
> +
> + return count;
> +}
> +
> +static unsigned long
> +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> +{
> + int cpu, freed = 0;
> + unsigned long flags;
> +
> + for_each_online_cpu(cpu) {
> + int count;
> + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +
> + count = krcp->count;
> + spin_lock_irqsave(&krcp->lock, flags);
> + if (krcp->monitor_todo)
> + kfree_rcu_drain_unlock(krcp, flags);
> + else
> + spin_unlock_irqrestore(&krcp->lock, flags);
> +
> + sc->nr_to_scan -= count;
> + freed += count;
> +
> + if (sc->nr_to_scan <= 0)
> + break;
> + }
> +
> + return freed;
> +}
> +
> +static struct shrinker kfree_rcu_shrinker = {
> + .count_objects = kfree_rcu_shrink_count,
> + .scan_objects = kfree_rcu_shrink_scan,
> + .batch = 0,
> + .seeks = DEFAULT_SEEKS,
> +};
> +
> void __init kfree_rcu_scheduler_running(void)
> {
> int cpu;
> @@ -3774,6 +3830,8 @@ static void __init kfree_rcu_batch_init(void)
> INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> krcp->initialized = true;
> }
> + if (register_shrinker(&kfree_rcu_shrinker))
> + pr_err("Failed to register kfree_rcu() shrinker!\n");
> }
>
> void __init rcu_init(void)
> --
> 2.25.0.265.gbab2e86ba0-goog
>

2020-03-05 22:26:36

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH linus/master 2/2] rcu/tree: Add a shrinker to prevent OOM due to kfree_rcu() batching

On Thu, Mar 05, 2020 at 05:17:53PM -0500, Joel Fernandes wrote:
> On Thu, Mar 05, 2020 at 05:13:23PM -0500, Joel Fernandes (Google) wrote:
> > To reduce grace periods and improve kfree() performance, we have done
> > batching recently dramatically bringing down the number of grace periods
> > while giving us the ability to use kfree_bulk() for efficient kfree'ing.
> >
> > However, this has increased the likelihood of OOM condition under heavy
> > kfree_rcu() flood on small memory systems. This patch introduces a
> > shrinker which starts grace periods right away if the system is under
> > memory pressure due to existence of objects that have still not started
> > a grace period.
> >
> > With this patch, I do not observe an OOM anymore on a system with 512MB
> > RAM and 8 CPUs, with the following rcuperf options:
> >
> > rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
> > rcuperf.kfree_rcu_test=1 rcuperf.kfree_mult=2
>
> Paul,
> I may have to rebase this patch on top of Vlad's kfree_bulk() work. But let
> us discuss patch and I can rebase it and repost it once patch looks Ok to
> you. (The kfree_bulk() work should not affect the patch).

BTW, we can also use this scheme in the future to keep garbage uncollected
until there is memory pressure. That way you defer grace periods for longer,
similar to the paper [1], until the MM layer thinks the party is over. For one
thing, I am not too confident about the shrinker's ability to handle transient
memory spikes. If I remember correctly, the shrinker is best-effort.

But one step at a time :)

thanks,

- Joel

[1] https://dl.acm.org/doi/10.1145/3190508.3190522