Recently, a discussion about the performance of systems involving a high rate
of kfree_rcu() calls surfaced on the list [1], which led to another
discussion about how to prepare for this situation.
This patch adds basic batching support for kfree_rcu(). It is "basic"
because we do none of the slab management, dynamic allocation, code
moving or any of the other things, some of which previous attempts did
[2]. These fancier improvements can be follow-up patches, and there are
several ideas being experimented with in those regards.
Torture tests follow in the next patch and show improvements of around
13% with continuous flooding of kfree_rcu() calls on a 16-CPU system.
[1] http://lore.kernel.org/lkml/[email protected]
[2] https://lkml.org/lkml/2017/12/19/824
This is an effort just to start simple, and build up from there.
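The caller-visible API is unchanged; a typical caller (purely illustrative)
still looks like this, with the batching happening behind kfree_call_rcu():

	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void foo_release(struct foo *fp)
	{
		/* Free fp after a grace period; now internally batched. */
		kfree_rcu(fp, rcu);
	}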
Cc: Rao Shoaib <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Co-developed-by: Byungchul Park <[email protected]>
Signed-off-by: Byungchul Park <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/tree.c | 198 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 193 insertions(+), 5 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..bdbd483606ce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2593,19 +2593,194 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu);
+
+/* Maximum number of jiffies to wait before draining batch */
+#define KFREE_DRAIN_JIFFIES 50
+
+/*
+ * Maximum number of kfree(s) to batch, if limit is hit
+ * then RCU work is queued right away
+ */
+#define KFREE_MAX_BATCH 200000ULL
+
+struct kfree_rcu_cpu {
+ /* The work done to free objects after GP */
+ struct rcu_work rcu_work;
+
+ /* The list of objects being queued */
+ struct rcu_head *head;
+ int kfree_batch_len;
+
+ /* The list of objects pending a free */
+ struct rcu_head *head_free;
+
+ /* Protect concurrent access to this structure */
+ spinlock_t lock;
+
+ /* The work done to monitor whether objects need free */
+ struct delayed_work monitor_work;
+ bool monitor_todo;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/* Free all heads after a grace period (worker function) */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct rcu_head *head, *next;
+ struct kfree_rcu_cpu *krc = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu, rcu_work);
+
+ spin_lock_irqsave(&krc->lock, flags);
+ head = krc->head_free;
+ krc->head_free = NULL;
+ spin_unlock_irqrestore(&krc->lock, flags);
+
+ /* The head must be detached and not referenced from anywhere */
+ for (; head; head = next) {
+ next = head->next;
+ head->next = NULL;
+ /* Could be possible to optimize with kfree_bulk in future */
+ __rcu_reclaim(rcu_state.name, head);
+ }
+}
+
+/*
+ * Schedule the kfree batch RCU work to run after GP.
+ *
+ * Either the batch reached its maximum size, or the monitor's
+ * time reached, either way schedule the batch work.
+ */
+static bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krc)
+{
+ lockdep_assert_held(&krc->lock);
+
+ /*
+ * Someone already drained, probably before the monitor's worker
+ * thread ran. Just return to avoid useless work.
+ */
+ if (!krc->head)
+ return true;
+
+ /*
+ * If RCU batch work already in progress, we cannot
+ * queue another one, just refuse the optimization.
+ */
+ if (krc->head_free)
+ return false;
+
+ krc->head_free = krc->head;
+ krc->head = NULL;
+ krc->kfree_batch_len = 0;
+ INIT_RCU_WORK(&krc->rcu_work, kfree_rcu_work);
+ queue_rcu_work(system_wq, &krc->rcu_work);
+
+ return true;
+}
+
+static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
+ unsigned long flags)
+{
+ struct rcu_head *head, *next;
+
+ /* It is time to do bulk reclaim after grace period */
+ krc->monitor_todo = false;
+ if (queue_kfree_rcu_work(krc)) {
+ spin_unlock_irqrestore(&krc->lock, flags);
+ return;
+ }
+
+ /*
+ * Use non-batch regular call_rcu for kfree_rcu in case things are too
+ * busy and batching of kfree_rcu could not be used.
+ */
+ head = krc->head;
+ krc->head = NULL;
+ krc->kfree_batch_len = 0;
+ spin_unlock_irqrestore(&krc->lock, flags);
+
+ for (; head; head = next) {
+ next = head->next;
+ head->next = NULL;
+ __call_rcu(head, head->func, -1, 1);
+ }
+}
+
+/*
+ * If enough time has passed, the kfree batch has to be drained
+ * and the monitor takes care of that.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ bool todo;
+ unsigned long flags;
+ struct kfree_rcu_cpu *krc = container_of(work, struct kfree_rcu_cpu,
+ monitor_work.work);
+
+ /* It is time to do bulk reclaim after grace period */
+ spin_lock_irqsave(&krc->lock, flags);
+ todo = krc->monitor_todo;
+ krc->monitor_todo = false;
+ if (todo)
+ kfree_rcu_drain_unlock(krc, flags);
+ else
+ spin_unlock_irqrestore(&krc->lock, flags);
+}
+
+static void kfree_rcu_batch(struct rcu_head *head, rcu_callback_t func)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *this_krc;
+ bool monitor_todo;
+
+ local_irq_save(flags);
+ this_krc = this_cpu_ptr(&krc);
+
+ spin_lock(&this_krc->lock);
+
+ /* Queue the kfree but don't yet schedule the batch */
+ head->func = func;
+ head->next = this_krc->head;
+ this_krc->head = head;
+ this_krc->kfree_batch_len++;
+
+ if (this_krc->kfree_batch_len == KFREE_MAX_BATCH) {
+ kfree_rcu_drain_unlock(this_krc, flags);
+ return;
+ }
+
+ /* Maximum has not been reached, schedule monitor for timely drain */
+ monitor_todo = this_krc->monitor_todo;
+ this_krc->monitor_todo = true;
+ spin_unlock(&this_krc->lock);
+
+ if (!monitor_todo) {
+ schedule_delayed_work_on(smp_processor_id(),
+ &this_krc->monitor_work, KFREE_DRAIN_JIFFIES);
+ }
+ local_irq_restore(flags);
+}
+
/*
* Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 1);
+ kfree_rcu_batch(head, func);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+/*
+ * The version of kfree_call_rcu that does not do batching of kfree_rcu()
+ * requests. To be used only for performance testing comparisons with
+ * kfree_rcu_batch().
+ */
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
+{
+ __call_rcu(head, func, -1, 1);
+}
+
/*
* During early boot, any blocking grace-period wait automatically
* implies a grace period. Later on, this is never the case for PREEMPT.
@@ -3452,6 +3627,17 @@ static void __init rcu_dump_rcu_node_tree(void)
pr_cont("\n");
}
+void kfree_rcu_batch_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ }
+}
+
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
@@ -3459,6 +3645,8 @@ void __init rcu_init(void)
{
int cpu;
+ kfree_rcu_batch_init();
+
rcu_early_boot_tests();
rcu_bootup_announce();
--
2.22.0.770.g0f2c4a37fd-goog
This test runs kfree_rcu in a loop to measure performance of the new
kfree_rcu, with and without patch.
To see improvement, run with boot parameters:
rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
Without patch, test runs in 6.9 seconds.
With patch, test runs in 6.1 seconds (+13% improvement)
If it is desired to run the test but with the traditional (non-batched)
kfree_rcu, for example to compare results, then you could pass along the
rcuperf.kfree_no_batch=1 boot parameter.
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/rcuperf.c | 169 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 1 deletion(-)
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..34658760da5e 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, kfree,...)");
static int nrealreaders;
static int nrealwriters;
@@ -592,6 +592,170 @@ rcu_perf_shutdown(void *arg)
return -EINVAL;
}
+/*
+ * kfree_rcu performance tests: Start a kfree_rcu loop on all CPUs for number
+ * of iterations and measure total time for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
+torture_param(int, kfree_alloc_size, 16, "Size of each allocation");
+torture_param(int, kfree_loops, 10, "Size of each allocation");
+torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+#define KFREE_OBJ_BYTES 8
+
+struct kfree_obj {
+ char kfree_obj[KFREE_OBJ_BYTES];
+ struct rcu_head rh;
+};
+
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
+
+static int
+kfree_perf_thread(void *arg)
+{
+ int i, l = 0;
+ long me = (long)arg;
+ struct kfree_obj **alloc_ptrs;
+ u64 start_time, end_time;
+
+ VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_kfree_perf_thread_started);
+
+ alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
+ GFP_KERNEL);
+ if (!alloc_ptrs)
+ return -ENOMEM;
+
+ start_time = ktime_get_mono_fast_ns();
+ do {
+ for (i = 0; i < kfree_alloc_num; i++) {
+ alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+ if (!alloc_ptrs[i])
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < kfree_alloc_num; i++) {
+ if (!kfree_no_batch) {
+ kfree_rcu(alloc_ptrs[i], rh);
+ } else {
+ rcu_callback_t cb;
+
+ cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
+ kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
+ }
+ }
+
+ schedule_timeout_uninterruptible(2);
+ } while (!torture_must_stop() && ++l < kfree_loops);
+
+ kfree(alloc_ptrs);
+
+ if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+ end_time = ktime_get_mono_fast_ns();
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d\n",
+ (unsigned long long)(end_time - start_time), kfree_loops);
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+
+ torture_kthread_stopping("kfree_perf_thread");
+ return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (kfree_reader_tasks) {
+ for (i = 0; i < kfree_nrealthreads; i++)
+ torture_stop_kthread(kfree_perf_thread,
+ kfree_reader_tasks[i]);
+ kfree(kfree_reader_tasks);
+ }
+
+ torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread. Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_perf_shutdown(void *arg)
+{
+ do {
+ wait_event(shutdown_wq,
+ atomic_read(&n_kfree_perf_thread_ended) >=
+ kfree_nrealthreads);
+ } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+
+ smp_mb(); /* Wake before output. */
+
+ kfree_perf_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+static int __init
+kfree_perf_init(void)
+{
+ long i;
+ int firsterr = 0;
+
+ if (!torture_init_begin("kfree_perf", verbose))
+ return -EBUSY;
+
+ kfree_nrealthreads = compute_real(kfree_nthreads);
+ /* Start up the kthreads. */
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
+ shutdown_task);
+ if (firsterr)
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+ GFP_KERNEL);
+ if (kfree_reader_tasks == NULL) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < kfree_nrealthreads; i++) {
+ firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
+ kfree_reader_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+
+ while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
+ schedule_timeout_uninterruptible(1);
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ kfree_perf_cleanup();
+ return firsterr;
+}
+
static int __init
rcu_perf_init(void)
{
@@ -601,6 +765,9 @@ rcu_perf_init(void)
&rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops,
};
+ if (strcmp(perf_type, "kfree") == 0)
+ return kfree_perf_init();
+
if (!torture_init_begin(perf_type, verbose))
return -EBUSY;
--
2.22.0.770.g0f2c4a37fd-goog
On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> Recently a discussion about performance of system involving a high rate
> of kfree_rcu() calls surfaced on the list [1] which led to another
> discussion how to prepare for this situation.
>
> This patch adds basic batching support for kfree_rcu. It is "basic"
> because we do none of the slab management, dynamic allocation, code
> moving or any of the other things, some of which previous attempts did
> [2]. These fancier improvements can be follow-up patches and there are
> several ideas being experimented in those regards.
>
> Torture tests follow in the next patch and show improvements of around
> ~13% with continuous flooding of kfree_rcu() calls on a 16 CPU system.
>
> [1] http://lore.kernel.org/lkml/[email protected]
> [2] https://lkml.org/lkml/2017/12/19/824
>
> This is an effort just to start simple, and build up from there.
>
> Cc: Rao Shoaib <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Co-developed-by: Byungchul Park <[email protected]>
> Signed-off-by: Byungchul Park <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
Looks like a great start! Some comments and questions interspersed.
There are a number of fixes required for this to be ready for mainline,
but that is to be expected for v1.
Thanx, Paul
> ---
> kernel/rcu/tree.c | 198 ++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 193 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index a14e5fbbea46..bdbd483606ce 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2593,19 +2593,194 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
> }
> EXPORT_SYMBOL_GPL(call_rcu);
>
> +
> +/* Maximum number of jiffies to wait before draining batch */
> +#define KFREE_DRAIN_JIFFIES 50
> +
> +/*
> + * Maximum number of kfree(s) to batch, if limit is hit
> + * then RCU work is queued right away
> + */
> +#define KFREE_MAX_BATCH 200000ULL
> +
> +struct kfree_rcu_cpu {
> + /* The work done to free objects after GP */
> + struct rcu_work rcu_work;
If I didn't already know what a struct rcu_work was, the comment would
completely confuse me. The key point is that an rcu_work structure
is to queue_kfree_rcu_work() as rcu_head is to call_rcu().
All the header comments need to be complete sentences as well, including
the period at the end of the sentence.
Yes, I am sure that you find the existing comments in RCU just as
confusing, but all the more reason to make it easier on people reading
your code. ;-)
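For example, something along these lines (the wording is only a suggestion,
not a requirement):
	struct kfree_rcu_cpu {
		/*
		 * The rcu_work node used to queue the batch for freeing in
		 * workqueue context after a grace period.
		 */
		struct rcu_work rcu_work;
		/* Objects queued by kfree_rcu() but not yet waiting for a GP. */
		struct rcu_head *head;
		/* Number of objects currently on ->head. */
		int kfree_batch_len;
		/* Objects detached from ->head and now waiting for a GP. */
		struct rcu_head *head_free;
		/* Protects concurrent access to this structure. */
		spinlock_t lock;
		/* Delayed work that drains ->head after KFREE_DRAIN_JIFFIES. */
		struct delayed_work monitor_work;
		bool monitor_todo;
	};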
> + /* The list of objects being queued */
> + struct rcu_head *head;
So these are the ones that have been queued, but are not yet waiting
for a grace period, correct?
> + int kfree_batch_len;
This length applies only to the callbacks in ->head, and not to those
in ->head_free, yes?
> + /* The list of objects pending a free */
> + struct rcu_head *head_free;
And these are the ones waiting for a grace period, right?
> + /* Protect concurrent access to this structure */
> + spinlock_t lock;
> +
> + /* The work done to monitor whether objects need free */
> + struct delayed_work monitor_work;
This would be the callbacks in ->head, not those in ->head_free, right?
> + bool monitor_todo;
> +};
Either way, the comments need a bit of work.
> +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> +
> +/* Free all heads after a grace period (worker function) */
Perhaps something like "This function is invoked in workqueue context
after a grace period. It frees all the objects queued on ->head_free."
The current "Frees all heads" is asking for confusion between ->head
and ->head_free.
> +static void kfree_rcu_work(struct work_struct *work)
> +{
> + unsigned long flags;
> + struct rcu_head *head, *next;
> + struct kfree_rcu_cpu *krc = container_of(to_rcu_work(work),
> + struct kfree_rcu_cpu, rcu_work);
> +
> + spin_lock_irqsave(&krc->lock, flags);
> + head = krc->head_free;
> + krc->head_free = NULL;
> + spin_unlock_irqrestore(&krc->lock, flags);
> +
> + /* The head must be detached and not referenced from anywhere */
> + for (; head; head = next) {
> + next = head->next;
> + head->next = NULL;
> + /* Could be possible to optimize with kfree_bulk in future */
> + __rcu_reclaim(rcu_state.name, head);
We need at least a cond_resched() here. If earlier experience of
a series of kfree() calls taking two milliseconds each holds up, a
cond_resched_tasks_rcu_qs() would also be necessary.
The two-milliseconds experience was with more than one hundred CPUs
concurrently doing kfree(), in case you were wondering why you hadn't
seen this in single-threaded operation.
Of course, I am hoping that a later patch uses an array of pointers built
at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
in order to reduce per-object cache-miss overhead. This would make it
easier for callback invocation to keep up with multi-CPU kfree_rcu()
floods.
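Concretely, the reclaim loop might look something like this (illustrative
only, not tested):
	for (; head; head = next) {
		next = head->next;
		head->next = NULL;
		__rcu_reclaim(rcu_state.name, head);
		/* Avoid monopolizing the CPU; also give Tasks RCU a QS. */
		cond_resched_tasks_rcu_qs();
	}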
> + }
> +}
> +
> +/*
> + * Schedule the kfree batch RCU work to run after GP.
Better! Still, please add the workqueue part, so that it is something
like "Schedule the kfree batch RCU work to run in workqueue context
after a GP."
> + *
> + * Either the batch reached its maximum size, or the monitor's
> + * time reached, either way schedule the batch work.
Perhaps something like "This function is invoked when the batch
has reached size KFREE_MAX_BATCH or when kfree_rcu_monitor() sees
that the KFREE_DRAIN_JIFFIES timeout has been reached."
> + */
> +static bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krc)
"krcp" would be more consistent with other RCU pointer naming. It would
also avoid the "this_" prefix below, whose purpose appears to be to
avoid confusion between the "krc" per-CPU variable and the "krc" pointer
currently in use.
> +{
> + lockdep_assert_held(&krc->lock);
> +
> + /*
> + * Someone already drained, probably before the monitor's worker
> + * thread ran. Just return to avoid useless work.
> + */
> + if (!krc->head)
> + return true;
Given that we held the lock across the earlier enqueue of a callback
onto ->head, how can ->head be NULL here? What am I missing?
> + /*
> + * If RCU batch work already in progress, we cannot
> + * queue another one, just refuse the optimization.
> + */
> + if (krc->head_free)
> + return false;
> +
> + krc->head_free = krc->head;
> + krc->head = NULL;
> + krc->kfree_batch_len = 0;
> + INIT_RCU_WORK(&krc->rcu_work, kfree_rcu_work);
> + queue_rcu_work(system_wq, &krc->rcu_work);
> +
> + return true;
> +}
> +
> +static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
> + unsigned long flags)
> +{
> + struct rcu_head *head, *next;
> +
> + /* It is time to do bulk reclaim after grace period */
> + krc->monitor_todo = false;
> + if (queue_kfree_rcu_work(krc)) {
> + spin_unlock_irqrestore(&krc->lock, flags);
> + return;
> + }
> +
> + /*
> + * Use non-batch regular call_rcu for kfree_rcu in case things are too
> + * busy and batching of kfree_rcu could not be used.
> + */
> + head = krc->head;
> + krc->head = NULL;
> + krc->kfree_batch_len = 0;
> + spin_unlock_irqrestore(&krc->lock, flags);
> +
> + for (; head; head = next) {
> + next = head->next;
> + head->next = NULL;
> + __call_rcu(head, head->func, -1, 1);
We need at least a cond_resched() here. 200,000 times through this loop
in a PREEMPT=n kernel might not always be pretty. Except that this is
invoked directly from kfree_rcu() which might be invoked with interrupts
disabled, which precludes calls to cond_resched(). So the realtime guys
are not going to be at all happy with this loop.
And this loop could be avoided entirely by having a third rcu_head list
in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
should be OK, or at least more OK than queuing 200,000 callbacks with
interrupts disabled. (If it turns out not to be OK, an array of rcu_head
pointers can be used to reduce the probability of oversized batches.)
This would also mean that the equality comparisons with KFREE_MAX_BATCH
need to become greater-or-equal comparisons or some such.
> + }
> +}
> +
> +/*
> + * If enough time has passed, the kfree batch has to be drained
> + * and the monitor takes care of that.
Perhaps something like: "This function is invoked after the
KFREE_DRAIN_JIFFIES timeout has elapsed, so it drains the specified
kfree_rcu_cpu structure's ->head list."
> + */
> +static void kfree_rcu_monitor(struct work_struct *work)
> +{
> + bool todo;
> + unsigned long flags;
> + struct kfree_rcu_cpu *krc = container_of(work, struct kfree_rcu_cpu,
> + monitor_work.work);
> +
> + /* It is time to do bulk reclaim after grace period */
Except that we really aren't doing kfree_bulk() yet because we are on
the wrong side of the grace period, which means that we are not going
to kfree() anything yet. The actual kfree()ing happens after the grace
period. (Yes, I see the other interpretation, but it would be good to
make it unambiguous.)
> + spin_lock_irqsave(&krc->lock, flags);
> + todo = krc->monitor_todo;
> + krc->monitor_todo = false;
> + if (todo)
> + kfree_rcu_drain_unlock(krc, flags);
> + else
> + spin_unlock_irqrestore(&krc->lock, flags);
> +}
> +
> +static void kfree_rcu_batch(struct rcu_head *head, rcu_callback_t func)
> +{
> + unsigned long flags;
> + struct kfree_rcu_cpu *this_krc;
> + bool monitor_todo;
> +
> + local_irq_save(flags);
> + this_krc = this_cpu_ptr(&krc);
> +
> + spin_lock(&this_krc->lock);
> +
> + /* Queue the kfree but don't yet schedule the batch */
> + head->func = func;
> + head->next = this_krc->head;
> + this_krc->head = head;
> + this_krc->kfree_batch_len++;
> +
> + if (this_krc->kfree_batch_len == KFREE_MAX_BATCH) {
> + kfree_rcu_drain_unlock(this_krc, flags);
> + return;
> + }
> +
> + /* Maximum has not been reached, schedule monitor for timely drain */
> + monitor_todo = this_krc->monitor_todo;
> + this_krc->monitor_todo = true;
> + spin_unlock(&this_krc->lock);
> +
> + if (!monitor_todo) {
> + schedule_delayed_work_on(smp_processor_id(),
> + &this_krc->monitor_work, KFREE_DRAIN_JIFFIES);
> + }
> + local_irq_restore(flags);
> +}
> +
> /*
> * Queue an RCU callback for lazy invocation after a grace period.
This should be expanded a bit. Workqueue context and all that.
> - * This will likely be later named something like "call_rcu_lazy()",
> - * but this change will require some way of tagging the lazy RCU
> - * callbacks in the list of pending callbacks. Until then, this
> - * function may only be called from __kfree_rcu().
> */
> void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> {
> - __call_rcu(head, func, -1, 1);
> + kfree_rcu_batch(head, func);
Can't we just put the contents of kfree_rcu_batch() here and avoid the
extra level of function call?
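That is, something like the following, which also picks up the krcp naming
(untested sketch, not a requirement):
	void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
	{
		unsigned long flags;
		struct kfree_rcu_cpu *krcp;
		bool monitor_todo;

		local_irq_save(flags);
		krcp = this_cpu_ptr(&krc);
		spin_lock(&krcp->lock);

		/* Queue the object but don't yet schedule the batch. */
		head->func = func;
		head->next = krcp->head;
		krcp->head = head;
		krcp->kfree_batch_len++;

		if (krcp->kfree_batch_len >= KFREE_MAX_BATCH) {
			kfree_rcu_drain_unlock(krcp, flags);
			return;
		}

		/* Maximum not reached, schedule monitor for a timely drain. */
		monitor_todo = krcp->monitor_todo;
		krcp->monitor_todo = true;
		spin_unlock(&krcp->lock);

		if (!monitor_todo)
			schedule_delayed_work_on(smp_processor_id(),
						 &krcp->monitor_work,
						 KFREE_DRAIN_JIFFIES);
		local_irq_restore(flags);
	}
	EXPORT_SYMBOL_GPL(kfree_call_rcu);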
> }
> EXPORT_SYMBOL_GPL(kfree_call_rcu);
>
> +/*
> + * The version of kfree_call_rcu that does not do batching of kfree_rcu()
> + * requests. To be used only for performance testing comparisons with
> + * kfree_rcu_batch().
> + */
> +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
> +{
> + __call_rcu(head, func, -1, 1);
> +}
You need an EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch) here to allow
rcuperf to be used via modprobe/rmmod. Normally 0day test robot would
have complained about this.
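In other words (sketch):
	void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
	{
		__call_rcu(head, func, -1, 1);
	}
	EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);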
> +
> /*
> * During early boot, any blocking grace-period wait automatically
> * implies a grace period. Later on, this is never the case for PREEMPT.
> @@ -3452,6 +3627,17 @@ static void __init rcu_dump_rcu_node_tree(void)
> pr_cont("\n");
> }
>
> +void kfree_rcu_batch_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> +
> + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> + }
> +}
> +
> struct workqueue_struct *rcu_gp_wq;
> struct workqueue_struct *rcu_par_gp_wq;
>
> @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> {
> int cpu;
>
> + kfree_rcu_batch_init();
What happens if someone does a kfree_rcu() before this point? It looks
like it should work, but have you tested it?
> rcu_early_boot_tests();
For example, by testing it in rcu_early_boot_tests() and moving the
call to kfree_rcu_batch_init() here.
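For example, something like the following in rcu_early_boot_tests() (untested
sketch; the struct and function names are just placeholders):
	struct early_boot_kfree_rcu {
		struct rcu_head rh;
	};

	static void early_boot_test_call_kfree_rcu(void)
	{
		struct early_boot_kfree_rcu *rhp;

		rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
		if (!WARN_ON_ONCE(!rhp))
			kfree_rcu(rhp, rh);
	}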
> rcu_bootup_announce();
> --
> 2.22.0.770.g0f2c4a37fd-goog
>
On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
> This test runs kfree_rcu in a loop to measure performance of the new
> kfree_rcu, with and without patch.
>
> To see improvement, run with boot parameters:
> rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
>
> Without patch, test runs in 6.9 seconds.
> With patch, test runs in 6.1 seconds (+13% improvement)
>
> If it is desired to run the test but with the traditional (non-batched)
> kfree_rcu, for example to compare results, then you could pass along the
> rcuperf.kfree_no_batch=1 boot parameter.
You lost me on this one. You ran two runs, with rcuperf.kfree_no_batch=1
and without? Or you ran this patch both with and without the earlier
patch, and could have run with the patch and rcuperf.kfree_no_batch=1?
If the latter, it would be good to try all three.
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
More comments below.
Thanx, Paul
> ---
> kernel/rcu/rcuperf.c | 169 ++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 168 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> index 7a6890b23c5f..34658760da5e 100644
> --- a/kernel/rcu/rcuperf.c
> +++ b/kernel/rcu/rcuperf.c
> @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
>
> static char *perf_type = "rcu";
> module_param(perf_type, charp, 0444);
> -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
> +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, kfree,...)");
>
> static int nrealreaders;
> static int nrealwriters;
> @@ -592,6 +592,170 @@ rcu_perf_shutdown(void *arg)
> return -EINVAL;
> }
>
> +/*
> + * kfree_rcu performance tests: Start a kfree_rcu loop on all CPUs for number
> + * of iterations and measure total time for all iterations to complete.
> + */
> +
> +torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
> +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
> +torture_param(int, kfree_alloc_size, 16, "Size of each allocation");
Is this used? How does it relate to KFREE_OBJ_BYTES?
> +torture_param(int, kfree_loops, 10, "Size of each allocation");
I suspect that this kfree_loops string is out of date.
> +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
All of these need to be added to kernel-parameters.txt. Along with
any added by the earlier patch, for that matter.
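For example, entries roughly of this form in
Documentation/admin-guide/kernel-parameters.txt (wording illustrative):
	rcuperf.kfree_nthreads= [KNL]
			Number of kthreads running the kfree_rcu() loop.
	rcuperf.kfree_alloc_num= [KNL]
			Number of allocations and frees done per iteration.
	rcuperf.kfree_loops= [KNL]
			Number of iterations of rcuperf.kfree_alloc_num
			allocations and frees.
	rcuperf.kfree_no_batch= [KNL]
			Use the non-batching (slower) version of kfree_rcu(),
			for comparison against the batched version.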
> +static struct task_struct **kfree_reader_tasks;
> +static int kfree_nrealthreads;
> +static atomic_t n_kfree_perf_thread_started;
> +static atomic_t n_kfree_perf_thread_ended;
> +
> +#define KFREE_OBJ_BYTES 8
> +
> +struct kfree_obj {
> + char kfree_obj[KFREE_OBJ_BYTES];
> + struct rcu_head rh;
> +};
> +
> +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
> +
> +static int
> +kfree_perf_thread(void *arg)
> +{
> + int i, l = 0;
It is really easy to confuse "l" and "1" in some fonts, so please use
a different name. (From the "showing my age" department: On typical
1970s typewriters, there was no numeral "1" -- you typed the letter
"l" instead, thus anticipating at least the first digit of "1337".)
> + long me = (long)arg;
> + struct kfree_obj **alloc_ptrs;
> + u64 start_time, end_time;
> +
> + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
> + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
> + set_user_nice(current, MAX_NICE);
> + atomic_inc(&n_kfree_perf_thread_started);
> +
> + alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
> + GFP_KERNEL);
> + if (!alloc_ptrs)
> + return -ENOMEM;
> +
> + start_time = ktime_get_mono_fast_ns();
Don't you want to announce that you started here rather than above in
order to avoid (admittedly slight) measurement inaccuracies?
> + do {
> + for (i = 0; i < kfree_alloc_num; i++) {
> + alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
> + if (!alloc_ptrs[i])
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < kfree_alloc_num; i++) {
> + if (!kfree_no_batch) {
> + kfree_rcu(alloc_ptrs[i], rh);
> + } else {
> + rcu_callback_t cb;
> +
> + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
> + kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
> + }
> + }
> +
> + schedule_timeout_uninterruptible(2);
Why the two-jiffy wait in the middle of a timed test? Yes, you need
a cond_resched() and maybe more here, but a two-jiffy wait? I don't
see how this has any chance of getting valid measurements.
What am I missing here?
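For a timed test, I would instead expect something more like this (sketch,
also renaming "l" to "loop"):
	do {
		for (i = 0; i < kfree_alloc_num; i++) {
			alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
			if (!alloc_ptrs[i])
				return -ENOMEM;
		}

		for (i = 0; i < kfree_alloc_num; i++)
			kfree_rcu(alloc_ptrs[i], rh);

		/* Yield occasionally, but don't sleep in the timed region. */
		cond_resched();
	} while (!torture_must_stop() && ++loop < kfree_loops);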
> + } while (!torture_must_stop() && ++l < kfree_loops);
> +
> + kfree(alloc_ptrs);
> +
> + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
> + end_time = ktime_get_mono_fast_ns();
Don't we want to capture the end time before the kfree()?
> + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d\n",
> + (unsigned long long)(end_time - start_time), kfree_loops);
> + if (shutdown) {
> + smp_mb(); /* Assign before wake. */
> + wake_up(&shutdown_wq);
> + }
> + }
> +
> + torture_kthread_stopping("kfree_perf_thread");
> + return 0;
> +}
> +
> +static void
> +kfree_perf_cleanup(void)
> +{
> + int i;
> +
> + if (torture_cleanup_begin())
> + return;
> +
> + if (kfree_reader_tasks) {
> + for (i = 0; i < kfree_nrealthreads; i++)
> + torture_stop_kthread(kfree_perf_thread,
> + kfree_reader_tasks[i]);
> + kfree(kfree_reader_tasks);
> + }
> +
> + torture_cleanup_end();
> +}
> +
> +/*
> + * shutdown kthread. Just waits to be awakened, then shuts down system.
> + */
> +static int
> +kfree_perf_shutdown(void *arg)
> +{
> + do {
> + wait_event(shutdown_wq,
> + atomic_read(&n_kfree_perf_thread_ended) >=
> + kfree_nrealthreads);
> + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
> +
> + smp_mb(); /* Wake before output. */
> +
> + kfree_perf_cleanup();
> + kernel_power_off();
> + return -EINVAL;
> +}
Is there some way to avoid (almost) duplicating rcu_perf_shutdown()?
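For instance, a common helper along these lines might work (untested sketch;
the names are placeholders):
	/* Wait for all perf threads to finish, clean up, then power off. */
	static int perf_shutdown_common(bool (*done)(void), void (*cleanup)(void))
	{
		do {
			wait_event(shutdown_wq, done());
		} while (!done());

		smp_mb(); /* Wake before output. */
		cleanup();
		kernel_power_off();
		return -EINVAL;
	}
Then rcu_perf_shutdown() and kfree_perf_shutdown() would each just pass in
their own completion check and cleanup function.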
> +static int __init
> +kfree_perf_init(void)
> +{
> + long i;
> + int firsterr = 0;
> +
> + if (!torture_init_begin("kfree_perf", verbose))
> + return -EBUSY;
> +
> + kfree_nrealthreads = compute_real(kfree_nthreads);
> + /* Start up the kthreads. */
> + if (shutdown) {
> + init_waitqueue_head(&shutdown_wq);
> + firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
> + shutdown_task);
> + if (firsterr)
> + goto unwind;
> + schedule_timeout_uninterruptible(1);
> + }
> +
> + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
> + GFP_KERNEL);
> + if (kfree_reader_tasks == NULL) {
> + firsterr = -ENOMEM;
> + goto unwind;
> + }
> +
> + for (i = 0; i < kfree_nrealthreads; i++) {
> + firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
> + kfree_reader_tasks[i]);
> + if (firsterr)
> + goto unwind;
> + }
> +
> + while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
> + schedule_timeout_uninterruptible(1);
> +
> + torture_init_end();
> + return 0;
> +
> +unwind:
> + torture_init_end();
> + kfree_perf_cleanup();
> + return firsterr;
> +}
> +
> static int __init
> rcu_perf_init(void)
> {
> @@ -601,6 +765,9 @@ rcu_perf_init(void)
> &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops,
> };
>
> + if (strcmp(perf_type, "kfree") == 0)
> + return kfree_perf_init();
> +
> if (!torture_init_begin(perf_type, verbose))
> return -EBUSY;
>
> --
> 2.22.0.770.g0f2c4a37fd-goog
>
On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
> On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > Recently a discussion about performance of system involving a high rate
> > of kfree_rcu() calls surfaced on the list [1] which led to another
> > discussion how to prepare for this situation.
> >
> > This patch adds basic batching support for kfree_rcu. It is "basic"
> > because we do none of the slab management, dynamic allocation, code
> > moving or any of the other things, some of which previous attempts did
> > [2]. These fancier improvements can be follow-up patches and there are
> > several ideas being experimented in those regards.
> >
> > Torture tests follow in the next patch and show improvements of around
> > ~13% with continuous flooding of kfree_rcu() calls on a 16 CPU system.
> >
> > [1] http://lore.kernel.org/lkml/[email protected]
> > [2] https://lkml.org/lkml/2017/12/19/824
> >
> > This is an effort just to start simple, and build up from there.
> >
> > Cc: Rao Shoaib <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Cc: [email protected]
> > Cc: [email protected]
> > Co-developed-by: Byungchul Park <[email protected]>
> > Signed-off-by: Byungchul Park <[email protected]>
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
>
> Looks like a great start! Some comments and questions interspersed.
> There are a number of fixes required for this to be ready for mainline,
> but that is to be expected for v1.
Sure, thanks. Replied below:
> > kernel/rcu/tree.c | 198 ++++++++++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 193 insertions(+), 5 deletions(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index a14e5fbbea46..bdbd483606ce 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2593,19 +2593,194 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
> > }
> > EXPORT_SYMBOL_GPL(call_rcu);
> >
> > +
> > +/* Maximum number of jiffies to wait before draining batch */
> > +#define KFREE_DRAIN_JIFFIES 50
> > +
> > +/*
> > + * Maximum number of kfree(s) to batch, if limit is hit
> > + * then RCU work is queued right away
> > + */
> > +#define KFREE_MAX_BATCH 200000ULL
> > +
> > +struct kfree_rcu_cpu {
> > + /* The work done to free objects after GP */
> > + struct rcu_work rcu_work;
>
> If I didn't already know what a struct rcu_work was, the comment would
> completely confuse me. The key point is that an rcu_work structure
> is to queue_kfree_rcu_work() as rcu_head is to call_rcu().
Will add comment.
> All the header comments need to be complete sentences as well, including
> the period at the end of the sentence.
> Yes, I am sure that you find the existing comments in RCU just as
> confusing, but all the more reason to make it easier on people reading
> your code. ;-)
Sure, will fix.
> > + /* The list of objects being queued */
> > + struct rcu_head *head;
>
> So these are the ones that have been queued, but are not yet waiting
> for a grace period, correct?
Yes, will improve comment.
> > + int kfree_batch_len;
>
> This length applies only to the callbacks in ->head, and not to those
> in ->head_free, yes?
Yes, will improve comment.
> > + /* The list of objects pending a free */
> > + struct rcu_head *head_free;
>
> And these are the ones waiting for a grace period, right?
Yes, will improve comment.
> > + /* Protect concurrent access to this structure */
> > + spinlock_t lock;
> > +
> > + /* The work done to monitor whether objects need free */
> > + struct delayed_work monitor_work;
>
> This would be the callbacks in ->head, not those in ->head_free, right?
This is just the delayed_work for making sure that kfree eventually happens.
At this point there can be callbacks queued in ->head, and also possibly
->head_free if the previous batch limit was reached.
> > + bool monitor_todo;
> > +};
>
> Either way, the comments need a bit of work.
Sure, will do.
> > +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> > +
> > +/* Free all heads after a grace period (worker function) */
>
> Perhaps something like "This function is invoked in workqueue context
> after a grace period. It frees all the objects queued on ->head_free."
>
> The current "Frees all heads" is asking for confusion between ->head
> and ->head_free.
Sure, will fix, I agree.
> > +static void kfree_rcu_work(struct work_struct *work)
> > +{
> > + unsigned long flags;
> > + struct rcu_head *head, *next;
> > + struct kfree_rcu_cpu *krc = container_of(to_rcu_work(work),
> > + struct kfree_rcu_cpu, rcu_work);
> > +
> > + spin_lock_irqsave(&krc->lock, flags);
> > + head = krc->head_free;
> > + krc->head_free = NULL;
> > + spin_unlock_irqrestore(&krc->lock, flags);
> > +
> > + /* The head must be detached and not referenced from anywhere */
> > + for (; head; head = next) {
> > + next = head->next;
> > + head->next = NULL;
> > + /* Could be possible to optimize with kfree_bulk in future */
> > + __rcu_reclaim(rcu_state.name, head);
>
> We need at least a cond_resched() here. If earlier experience of
> a series of kfree() calls taking two milliseconds each holds up, a
> cond_resched_tasks_rcu_qs() would also be necessary.
>
> The two-milliseconds experience was with more than one hundred CPUs
> concurrently doing kfree(), in case you were wondering why you hadn't
> seen this in single-threaded operation.
I see. Ok, I will add cond_resched_tasks_rcu_qs().
> Of course, I am hoping that a later patch uses an array of pointers built
> at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
> in order to reduce per-object cache-miss overhead. This would make it
> easier for callback invocation to keep up with multi-CPU kfree_rcu()
> floods.
I think Byungchul tried an experiment with an array of pointers and wasn't
immediately able to see a benefit. Perhaps his patch needs a bit more polish,
or another test case is needed to show the benefit from fewer cache misses;
the perf tool could be used to confirm whether cache misses were reduced. For
this initial pass, we decided to keep it without the array optimization.
Would you or Rao have any suggestions for a test case that can show the
benefit of an array of pointers?
> > + * Schedule the kfree batch RCU work to run after GP.
>
> Better! Still, please add the workqueue part, so that it is something
> like "Schedule the kfree batch RCU work to run in workqueue context
> after a GP."
Ok, will be more specific.
> > + *
> > + * Either the batch reached its maximum size, or the monitor's
> > + * time reached, either way schedule the batch work.
>
> Perhaps something like "This function is invoked when the batch
> has reached size KFREE_MAX_BATCH or when kfree_rcu_monitor() sees
> that the KFREE_DRAIN_JIFFIES timeout has been reached."
Ok, will fix.
> > +static bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krc)
>
> "krcp" would be more consistent with other RCU pointer naming. It would
> also avoid the "this_" prefix below, whose purpose appears to be to
> avoid confusion between the "krc" per-CPU variable and the "krc" pointer
> currently in use.
Sure, makes sense, will do!
> > + lockdep_assert_held(&krc->lock);
> > +
> > + /*
> > + * Someone already drained, probably before the monitor's worker
> > + * thread ran. Just return to avoid useless work.
> > + */
> > + if (!krc->head)
> > + return true;
>
> Given that we held the lock across the earlier enqueue of a callback
> onto ->head, how can ->head be NULL here? What am I missing?
This was added for the following sequence of events:
1. kfree_rcu_batch() runs a few times, so a few kfree_rcu() requests
are queued.
2. kfree_rcu_batch() also schedules the monitor work, which eventually
results in kfree_rcu_monitor() being called in the future.
3. Now a large number of kfree_rcu() requests come in, and we are forced to
do the 'draining' earlier than usual. We do so, and head becomes NULL.
4. kfree_rcu_monitor() gets called now due to step 2, which calls
kfree_rcu_drain_unlock(), which calls queue_kfree_rcu_work(). Here we see that
the 'draining' from step 3 already happened, so there's nothing to do.
Actually, since step 3 already sets monitor_todo to false, I think
this test is redundant and I can just remove it. Thanks for pointing this
out.
> > + * If RCU batch work already in progress, we cannot
> > + * queue another one, just refuse the optimization.
> > + */
> > + if (krc->head_free)
> > + return false;
> > +
> > + krc->head_free = krc->head;
> > + krc->head = NULL;
> > + krc->kfree_batch_len = 0;
> > + INIT_RCU_WORK(&krc->rcu_work, kfree_rcu_work);
> > + queue_rcu_work(system_wq, &krc->rcu_work);
> > +
> > + return true;
> > +}
> > +
> > +static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
> > + unsigned long flags)
> > +{
> > + struct rcu_head *head, *next;
> > +
> > + /* It is time to do bulk reclaim after grace period */
> > + krc->monitor_todo = false;
> > + if (queue_kfree_rcu_work(krc)) {
> > + spin_unlock_irqrestore(&krc->lock, flags);
> > + return;
> > + }
> > +
> > + /*
> > + * Use non-batch regular call_rcu for kfree_rcu in case things are too
> > + * busy and batching of kfree_rcu could not be used.
> > + */
> > + head = krc->head;
> > + krc->head = NULL;
> > + krc->kfree_batch_len = 0;
> > + spin_unlock_irqrestore(&krc->lock, flags);
> > +
> > + for (; head; head = next) {
> > + next = head->next;
> > + head->next = NULL;
> > + __call_rcu(head, head->func, -1, 1);
>
> We need at least a cond_resched() here. 200,000 times through this loop
> in a PREEMPT=n kernel might not always be pretty. Except that this is
> invoked directly from kfree_rcu() which might be invoked with interrupts
> disabled, which precludes calls to cond_resched(). So the realtime guys
> are not going to be at all happy with this loop.
Ok, will add this here.
> And this loop could be avoided entirely by having a third rcu_head list
> in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> should be OK, or at least more OK than queuing 200,000 callbacks with
> interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> pointers can be used to reduce the probability of oversized batches.)
> This would also mean that the equality comparisons with KFREE_MAX_BATCH
> need to become greater-or-equal comparisons or some such.
Yes, certainly we can do these kinds of improvements after this patch, and
then add more tests to validate the improvements.
> > + * If enough time has passed, the kfree batch has to be drained
> > + * and the monitor takes care of that.
>
> Perhaps something like: "This function is invoked after the
> KFREE_DRAIN_JIFFIES timeout has elapse, so it drains the specified
> kfree_rcu_cpu structure's ->head list."
Ok, sure.
> > + */
> > +static void kfree_rcu_monitor(struct work_struct *work)
> > +{
> > + bool todo;
> > + unsigned long flags;
> > + struct kfree_rcu_cpu *krc = container_of(work, struct kfree_rcu_cpu,
> > + monitor_work.work);
> > +
> > + /* It is time to do bulk reclaim after grace period */
>
> Except that we really aren't doing kfree_bulk() yet because we are on
> the wrong side of the grace period, which means that we are not going
> to kfree() anything yet. The actual kfree()ing happens after the grace
> period. (Yes, I see the other interpretation, but it would be good to
> make it unambiguous.)
Right, I will improve the comment.
> > + spin_lock_irqsave(&krc->lock, flags);
> > + todo = krc->monitor_todo;
> > + krc->monitor_todo = false;
> > + if (todo)
> > + kfree_rcu_drain_unlock(krc, flags);
> > + else
> > + spin_unlock_irqrestore(&krc->lock, flags);
> > +}
> > +
> > +static void kfree_rcu_batch(struct rcu_head *head, rcu_callback_t func)
> > +{
> > + unsigned long flags;
> > + struct kfree_rcu_cpu *this_krc;
> > + bool monitor_todo;
> > +
> > + local_irq_save(flags);
> > + this_krc = this_cpu_ptr(&krc);
> > +
> > + spin_lock(&this_krc->lock);
> > +
> > + /* Queue the kfree but don't yet schedule the batch */
> > + head->func = func;
> > + head->next = this_krc->head;
> > + this_krc->head = head;
> > + this_krc->kfree_batch_len++;
> > +
> > + if (this_krc->kfree_batch_len == KFREE_MAX_BATCH) {
> > + kfree_rcu_drain_unlock(this_krc, flags);
> > + return;
> > + }
> > +
> > + /* Maximum has not been reached, schedule monitor for timely drain */
> > + monitor_todo = this_krc->monitor_todo;
> > + this_krc->monitor_todo = true;
> > + spin_unlock(&this_krc->lock);
> > +
> > + if (!monitor_todo) {
> > + schedule_delayed_work_on(smp_processor_id(),
> > + &this_krc->monitor_work, KFREE_DRAIN_JIFFIES);
> > + }
> > + local_irq_restore(flags);
> > +}
> > +
> > /*
> > * Queue an RCU callback for lazy invocation after a grace period.
>
> This should be expanded a bit. Workqueue context and all that.
Ok, sure, will be more clear.
> > - * This will likely be later named something like "call_rcu_lazy()",
> > - * but this change will require some way of tagging the lazy RCU
> > - * callbacks in the list of pending callbacks. Until then, this
> > - * function may only be called from __kfree_rcu().
> > */
> > void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > {
> > - __call_rcu(head, func, -1, 1);
> > + kfree_rcu_batch(head, func);
>
> Can't we just put the contents of kfree_rcu_batch() here and avoid the
> extra level of function call?
Sure ok, that makes sense, we can do that.
> > }
> > EXPORT_SYMBOL_GPL(kfree_call_rcu);
> >
> > +/*
> > + * The version of kfree_call_rcu that does not do batching of kfree_rcu()
> > + * requests. To be used only for performance testing comparisons with
> > + * kfree_rcu_batch().
> > + */
> > +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
> > +{
> > + __call_rcu(head, func, -1, 1);
> > +}
>
> You need an EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch) here to allow
> rcuperf to be used via modprobe/rmmod. Normally 0day test robot would
> have complained about this.
Ok, will fix.
> > /*
> > * During early boot, any blocking grace-period wait automatically
> > * implies a grace period. Later on, this is never the case for PREEMPT.
> > @@ -3452,6 +3627,17 @@ static void __init rcu_dump_rcu_node_tree(void)
> > pr_cont("\n");
> > }
> >
> > +void kfree_rcu_batch_init(void)
> > +{
> > + int cpu;
> > +
> > + for_each_possible_cpu(cpu) {
> > + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > +
> > + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > + }
> > +}
> > +
> > struct workqueue_struct *rcu_gp_wq;
> > struct workqueue_struct *rcu_par_gp_wq;
> >
> > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > {
> > int cpu;
> >
> > + kfree_rcu_batch_init();
>
> What happens if someone does a kfree_rcu() before this point? It looks
> like it should work, but have you tested it?
>
> > rcu_early_boot_tests();
>
> For example, by testing it in rcu_early_boot_tests() and moving the
> call to kfree_rcu_batch_init() here.
I have not tried to do the kfree_rcu() this early. I will try it out.
thanks!
- Joel
On Tue, Aug 06, 2019 at 05:29:15PM -0700, Paul E. McKenney wrote:
> On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
> > This test runs kfree_rcu in a loop to measure performance of the new
> > kfree_rcu, with and without patch.
> >
> > To see improvement, run with boot parameters:
> > rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
> >
> > Without patch, test runs in 6.9 seconds.
> > With patch, test runs in 6.1 seconds (+13% improvement)
> >
> > If it is desired to run the test but with the traditional (non-batched)
> > kfree_rcu, for example to compare results, then you could pass along the
> > rcuperf.kfree_no_batch=1 boot parameter.
>
> You lost me on this one. You ran two runs, with rcuperf.kfree_no_batch=1
> and without? Or you ran this patch both with and without the earlier
> patch, and could have run with the patch and rcuperf.kfree_no_batch=1?
I always run the rcutorture test with patch because the patch doesn't really
do anything if rcuperf.kfree_no_batch=0. This parameter is added so that in
the future folks can compare effect of non-batching with that of the
batching. However, I can also remove the patch itself and run this test
again.
> If the latter, it would be good to try all three.
Ok, sure.
[snip]
> > ---
> > kernel/rcu/rcuperf.c | 169 ++++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 168 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> > index 7a6890b23c5f..34658760da5e 100644
> > --- a/kernel/rcu/rcuperf.c
> > +++ b/kernel/rcu/rcuperf.c
> > @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
> >
> > static char *perf_type = "rcu";
> > module_param(perf_type, charp, 0444);
> > -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
> > +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, kfree,...)");
> >
> > static int nrealreaders;
> > static int nrealwriters;
> > @@ -592,6 +592,170 @@ rcu_perf_shutdown(void *arg)
> > return -EINVAL;
> > }
> >
> > +/*
> > + * kfree_rcu performance tests: Start a kfree_rcu loop on all CPUs for number
> > + * of iterations and measure total time for all iterations to complete.
> > + */
> > +
> > +torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
> > +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
> > +torture_param(int, kfree_alloc_size, 16, "Size of each allocation");
>
> Is this used? How does it relate to KFREE_OBJ_BYTES?
You're right, I had added this before but it is unused now. Sorry about that,
I will remove it.
> > +torture_param(int, kfree_loops, 10, "Size of each allocation");
>
> I suspect that this kfree_loops string is out of date.
Yes, complete screw up, will update it.
> > +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
>
> All of these need to be added to kernel-parameters.txt. Along with
> any added by the earlier patch, for that matter.
Sure, should I split that into a separate patch?
> > +static struct task_struct **kfree_reader_tasks;
> > +static int kfree_nrealthreads;
> > +static atomic_t n_kfree_perf_thread_started;
> > +static atomic_t n_kfree_perf_thread_ended;
> > +
> > +#define KFREE_OBJ_BYTES 8
> > +
> > +struct kfree_obj {
> > + char kfree_obj[KFREE_OBJ_BYTES];
> > + struct rcu_head rh;
> > +};
> > +
> > +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
> > +
> > +static int
> > +kfree_perf_thread(void *arg)
> > +{
> > + int i, l = 0;
>
> It is really easy to confuse "l" and "1" in some fonts, so please use
> a different name. (From the "showing my age" department: On typical
> 1970s typewriters, there was no numeral "1" -- you typed the letter
> "l" instead, thus anticipating at least the first digit of "1337".)
:-D Ok, I will improve the names.
> > + long me = (long)arg;
> > + struct kfree_obj **alloc_ptrs;
> > + u64 start_time, end_time;
> > +
> > + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
> > + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
> > + set_user_nice(current, MAX_NICE);
> > + atomic_inc(&n_kfree_perf_thread_started);
> > +
> > + alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
> > + GFP_KERNEL);
> > + if (!alloc_ptrs)
> > + return -ENOMEM;
> > +
> > + start_time = ktime_get_mono_fast_ns();
>
> Don't you want to announce that you started here rather than above in
> order to avoid (admittedly slight) measurement inaccuracies?
I did not follow; are you referring to the measurement inaccuracy related to
the "kfree_perf_thread task started" string print? Or are you saying that
the ktime_get_mono_fast_ns() call has to happen earlier than here?
> > + do {
> > + for (i = 0; i < kfree_alloc_num; i++) {
> > + alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
> > + if (!alloc_ptrs[i])
> > + return -ENOMEM;
> > + }
> > +
> > + for (i = 0; i < kfree_alloc_num; i++) {
> > + if (!kfree_no_batch) {
> > + kfree_rcu(alloc_ptrs[i], rh);
> > + } else {
> > + rcu_callback_t cb;
> > +
> > + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
> > + kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
> > + }
> > + }
> > +
> > + schedule_timeout_uninterruptible(2);
>
> Why the two-jiffy wait in the middle of a timed test? Yes, you need
> a cond_resched() and maybe more here, but a two-jiffy wait? I don't
> see how this has any chance of getting valid measurements.
>
> What am I missing here?
I am getting pretty reliable and repeatable results with this test. The sleep
was mostly just to give the system a chance to schedule other tasks. I can
remove the schedule and also try with just cond_resched().
The other reason for the sleep was to give the test a longer running time and
make measurement easier as a result, since the test would otherwise run for a
very short while. Agreed, there might be a better way to handle this issue.
(I will reply to the rest of the comments below in a bit; I am going to a
hospital now to visit a sick relative and will be back a bit later.)
thanks!
- Joel
On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
[ . . . ]
> > > + for (; head; head = next) {
> > > + next = head->next;
> > > + head->next = NULL;
> > > + __call_rcu(head, head->func, -1, 1);
> >
> > We need at least a cond_resched() here. 200,000 times through this loop
> > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > invoked directly from kfree_rcu() which might be invoked with interrupts
> > disabled, which precludes calls to cond_resched(). So the realtime guys
> > are not going to be at all happy with this loop.
>
> Ok, will add this here.
>
> > And this loop could be avoided entirely by having a third rcu_head list
> > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > should be OK, or at least more OK than queuing 200,000 callbacks with
> > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > pointers can be used to reduce the probability of oversized batches.)
> > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > need to become greater-or-equal comparisons or some such.
>
> Yes, certainly we can do these kinds of improvements after this patch, and
> then add more tests to validate the improvements.
Out of pity for people bisecting, we need this fixed up front.
My suggestion is to just allow ->head to grow until ->head_free becomes
available. That way you are looping with interrupts and preemption
enabled in workqueue context, which is much less damaging than doing so
with interrupts disabled, and possibly even from hard-irq context.
But please feel free to come up with a better solution!
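Roughly something like this, perhaps (untested sketch):
	static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
					   unsigned long flags)
	{
		/* Attempt to detach ->head onto ->head_free and start a batch. */
		if (!queue_kfree_rcu_work(krcp)) {
			/*
			 * The previous batch is still waiting for its grace
			 * period.  Leave the objects on ->head and let the
			 * monitor retry later, rather than queuing them one
			 * at a time with interrupts disabled.
			 */
			krcp->monitor_todo = true;
			schedule_delayed_work(&krcp->monitor_work,
					      KFREE_DRAIN_JIFFIES);
		}
		spin_unlock_irqrestore(&krcp->lock, flags);
	}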
[ . . . ]
> > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > {
> > > int cpu;
> > >
> > > + kfree_rcu_batch_init();
> >
> > What happens if someone does a kfree_rcu() before this point? It looks
> > like it should work, but have you tested it?
> >
> > > rcu_early_boot_tests();
> >
> > For example, by testing it in rcu_early_boot_tests() and moving the
> > call to kfree_rcu_batch_init() here.
>
> I have not tried to do the kfree_rcu() this early. I will try it out.
Yeah, well, call_rcu() this early came as a surprise to me back in the
day, so... ;-)
Thanx, Paul
On Wed, Aug 07, 2019 at 06:22:13AM -0400, Joel Fernandes wrote:
> On Tue, Aug 06, 2019 at 05:29:15PM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
> > > This test runs kfree_rcu in a loop to measure performance of the new
> > > kfree_rcu, with and without patch.
> > >
> > > To see improvement, run with boot parameters:
> > > rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
> > >
> > > Without patch, test runs in 6.9 seconds.
> > > With patch, test runs in 6.1 seconds (+13% improvement)
> > >
> > > If it is desired to run the test but with the traditional (non-batched)
> > > kfree_rcu, for example to compare results, then you could pass along the
> > > rcuperf.kfree_no_batch=1 boot parameter.
> >
> > You lost me on this one. You ran two runs, with rcuperf.kfree_no_batch=1
> > and without? Or you ran this patch both with and without the earlier
> > patch, and could have run with the patch and rcuperf.kfree_no_batch=1?
>
> I always run the rcutorture test with patch because the patch doesn't really
> do anything if rcuperf.kfree_no_batch=0. This parameter is added so that in
> the future folks can compare effect of non-batching with that of the
> batching. However, I can also remove the patch itself and run this test
> again.
>
> > If the latter, it would be good to try all three.
>
> Ok, sure.
Very good! And please make the commit log more clear. ;-)
> [snip]
> > > ---
> > > kernel/rcu/rcuperf.c | 169 ++++++++++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 168 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> > > index 7a6890b23c5f..34658760da5e 100644
> > > --- a/kernel/rcu/rcuperf.c
> > > +++ b/kernel/rcu/rcuperf.c
> > > @@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
> > >
> > > static char *perf_type = "rcu";
> > > module_param(perf_type, charp, 0444);
> > > -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
> > > +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, kfree,...)");
> > >
> > > static int nrealreaders;
> > > static int nrealwriters;
> > > @@ -592,6 +592,170 @@ rcu_perf_shutdown(void *arg)
> > > return -EINVAL;
> > > }
> > >
> > > +/*
> > > + * kfree_rcu performance tests: Start a kfree_rcu loop on all CPUs for number
> > > + * of iterations and measure total time for all iterations to complete.
> > > + */
> > > +
> > > +torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
> > > +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
> > > +torture_param(int, kfree_alloc_size, 16, "Size of each allocation");
> >
> > Is this used? How does it relate to KFREE_OBJ_BYTES?
>
> You're right, I had added this before but it is unused now. Sorry about that,
> I will remove it.
>
> > > +torture_param(int, kfree_loops, 10, "Size of each allocation");
> >
> > I suspect that this kfree_loops string is out of date.
>
> Yes, complete screw up, will update it.
>
> > > +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
> >
> > All of these need to be added to kernel-parameters.txt. Along with
> > any added by the earlier patch, for that matter.
>
> Sure, should I split that into a separate patch?
Your choice.
> > > +static struct task_struct **kfree_reader_tasks;
> > > +static int kfree_nrealthreads;
> > > +static atomic_t n_kfree_perf_thread_started;
> > > +static atomic_t n_kfree_perf_thread_ended;
> > > +
> > > +#define KFREE_OBJ_BYTES 8
> > > +
> > > +struct kfree_obj {
> > > + char kfree_obj[KFREE_OBJ_BYTES];
> > > + struct rcu_head rh;
> > > +};
> > > +
> > > +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
> > > +
> > > +static int
> > > +kfree_perf_thread(void *arg)
> > > +{
> > > + int i, l = 0;
> >
> > It is really easy to confuse "l" and "1" in some fonts, so please use
> > a different name. (From the "showing my age" department: On typical
> > 1970s typewriters, there was no numeral "1" -- you typed the letter
> > "l" instead, thus anticipating at least the first digit of "1337".)
>
> :-D Ok, I will improve the names.
>
> > > + long me = (long)arg;
> > > + struct kfree_obj **alloc_ptrs;
> > > + u64 start_time, end_time;
> > > +
> > > + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
> > > + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
> > > + set_user_nice(current, MAX_NICE);
> > > + atomic_inc(&n_kfree_perf_thread_started);
> > > +
> > > + alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
> > > + GFP_KERNEL);
> > > + if (!alloc_ptrs)
> > > + return -ENOMEM;
> > > +
> > > + start_time = ktime_get_mono_fast_ns();
> >
> > Don't you want to announce that you started here rather than above in
> > order to avoid (admittedly slight) measurement inaccuracies?
>
> I did not follow, are you referring to the measurement inaccuracy related to
> the "kfree_perf_thread task started" string print? Or, are you saying that
> ktime_get_mono_fast_ns() has to start earlier than over here?
I am referring to the atomic_inc().
> > > + do {
> > > + for (i = 0; i < kfree_alloc_num; i++) {
> > > + alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
> > > + if (!alloc_ptrs[i])
> > > + return -ENOMEM;
> > > + }
> > > +
> > > + for (i = 0; i < kfree_alloc_num; i++) {
> > > + if (!kfree_no_batch) {
> > > + kfree_rcu(alloc_ptrs[i], rh);
> > > + } else {
> > > + rcu_callback_t cb;
> > > +
> > > + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
> > > + kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
> > > + }
> > > + }
> > > +
> > > + schedule_timeout_uninterruptible(2);
> >
> > Why the two-jiffy wait in the middle of a timed test? Yes, you need
> > a cond_resched() and maybe more here, but a two-jiffy wait? I don't
> > see how this has any chance of getting valid measurements.
> >
> > What am I missing here?
>
> I am getting pretty reliable and repeatable results with this test.
That is a good thing, but you might not be measuring what you think you
are measuring.
> The sleep
> was mostly just to give the system a chance to scheduler other tasks. I can
> remove the schedule and also try with just cond_resched().
Please do! This can be a bit fiddly, but there is example code in
current rcutorture on -rcu.
> The other reason for the schedule call was also to give the test a longer
> running time and help with easier measurement as a result, since the test
> would run otherwise for a very shortwhile. Agreed there might be a better way
> to handle this issue.
Easy! Do more kmalloc()/kfree_rcu() pairs! ;-)
> (I will reply to the rest of the comments below in a bit, I am going to a
> hospital now to visit a sick relative and will be back a bit later.)
Ouch!!! I hope that goes as well as it possibly can! And please don't
neglect your relative on RCU's account!!!
Thanx, Paul
On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
>
> [ . . . ]
>
> > > > + for (; head; head = next) {
> > > > + next = head->next;
> > > > + head->next = NULL;
> > > > + __call_rcu(head, head->func, -1, 1);
> > >
> > > We need at least a cond_resched() here. 200,000 times through this loop
> > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > are not going to be at all happy with this loop.
> >
> > Ok, will add this here.
> >
> > > And this loop could be avoided entirely by having a third rcu_head list
> > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > pointers can be used to reduce the probability of oversized batches.)
> > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > need to become greater-or-equal comparisons or some such.
> >
> > Yes, certainly we can do these kinds of improvements after this patch, and
> > then add more tests to validate the improvements.
>
> Out of pity for people bisecting, we need this fixed up front.
>
> My suggestion is to just allow ->head to grow until ->head_free becomes
> available. That way you are looping with interrupts and preemption
> enabled in workqueue context, which is much less damaging than doing so
> with interrupts disabled, and possibly even from hard-irq context.
Agree.
Or after introducing another limit like KFREE_MAX_BATCH_FORCE (>=
KFREE_MAX_BATCH):
1. Try to drain it on hitting KFREE_MAX_BATCH as it does now.
   On success: Same as now.
   On fail: let ->head grow and drain if possible, until reaching
   KFREE_MAX_BATCH_FORCE.
3. On hitting KFREE_MAX_BATCH_FORCE, give up batching and handle requests
   one by one from now on, to prevent too many pending requests from being
   queued for the batching work.
This way, we can avoid both:
1. too many requests being queued and
2. calling __call_rcu() on a bunch of requests within a single kfree_rcu().
Thanks,
Byungchul
>
> But please feel free to come up with a better solution!
>
> [ . . . ]
>
> > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > {
> > > > int cpu;
> > > >
> > > > + kfree_rcu_batch_init();
> > >
> > > What happens if someone does a kfree_rcu() before this point? It looks
> > > like it should work, but have you tested it?
> > >
> > > > rcu_early_boot_tests();
> > >
> > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > call to kfree_rcu_batch_init() here.
> >
> > I have not tried to do the kfree_rcu() this early. I will try it out.
>
> Yeah, well, call_rcu() this early came as a surprise to me back in the
> day, so... ;-)
>
> Thanx, Paul
On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
[snip]
> > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > Of course, I am hoping that a later patch uses an array of pointers built
> > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
> > in order to reduce per-object cache-miss overhead. This would make it
> > easier for callback invocation to keep up with multi-CPU kfree_rcu()
> > floods.
>
> I think Byungchul tried an experiment with array of pointers and wasn't
> immediately able to see a benefit. Perhaps his patch needs a bit more polish
> or another test-case needed to show benefit due to cache-misses, and the perf
> tool could be used to show if cache misses were reduced. For this initial
> pass, we decided to keep it without the array optimization.
I'm still seeing no improvement with kfree_bulk().
I've been thinking I could see improvement with kfree_bulk() because:
1. As you guys said, the number of cache misses will be reduced.
2. We can save (N - 1) irq-disable/enable sequences across N kfrees.
3. As Joel said, saving/restoring CPU status that kfree() does inside
is not required.
But even with the following patch applied, the result was the same as with
plain batching. We might need to get kmalloc objects from random addresses
to maximize the benefit of kfree_bulk(), which would also be closer to the
real world.
And the second and third reasons don't seem to help as much as I expected.
Do you have any idea? Or what do you think about it?
Thanks,
Byungchul
-----8<-----
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 988e1ae..6f2ab06 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -651,10 +651,10 @@ struct kfree_obj {
return -ENOMEM;
}
- for (i = 0; i < kfree_alloc_num; i++) {
- if (!kfree_no_batch) {
- kfree_rcu(alloc_ptrs[i], rh);
- } else {
+ if (!kfree_no_batch) {
+ kfree_bulk(kfree_alloc_num, (void **)alloc_ptrs);
+ } else {
+ for (i = 0; i < kfree_alloc_num; i++) {
rcu_callback_t cb;
cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
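As a side note, one cheap way to get a less cache-friendly free order
(purely an illustration I am adding here, not part of the posted diff;
the helper name is made up and linux/random.h may need to be included)
would be to shuffle the pointer array before handing it to kfree_bulk():

/* Fisher-Yates shuffle of the pointer array. */
static void kfree_perf_shuffle(void **p, int n)
{
	int i;

	for (i = n - 1; i > 0; i--) {
		int j = prandom_u32() % (i + 1);

		swap(p[i], p[j]);
	}
}

This could be called on alloc_ptrs just before the kfree_bulk() line above.
Note it only randomizes the free order; the objects themselves would still
come from the same few slabs, so it is at best an approximation of getting
objects from random addresses.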
On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > [ . . . ]
> > > > > + for (; head; head = next) {
> > > > > + next = head->next;
> > > > > + head->next = NULL;
> > > > > + __call_rcu(head, head->func, -1, 1);
> > > >
> > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > are not going to be at all happy with this loop.
> > >
> > > Ok, will add this here.
> > >
> > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > pointers can be used to reduce the probability of oversized batches.)
> > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > need to become greater-or-equal comparisons or some such.
> > >
> > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > then add more tests to validate the improvements.
> >
> > Out of pity for people bisecting, we need this fixed up front.
> >
> > My suggestion is to just allow ->head to grow until ->head_free becomes
> > available. That way you are looping with interrupts and preemption
> > enabled in workqueue context, which is much less damaging than doing so
> > with interrupts disabled, and possibly even from hard-irq context.
>
> Agree.
>
> Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> KFREE_MAX_BATCH):
>
> 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
>
> On success: Same as now.
> On fail: let ->head grow and drain if possible, until reaching to
> KFREE_MAX_BATCH_FORCE.
>
> 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> one from now on to prevent too many pending requests from being
> queued for batching work.
I also agree. But this _FORCE thing will still not solve the issue Paul is
raising, which is doing this loop possibly in irq-disabled / hardirq context.
We can't even cond_resched() here. In fact, since _FORCE is larger, it will be
even worse. Consider a real-time system with a lot of memory: in this case
letting ->head grow large is OK, but looping for a long time with IRQs
disabled would not be.
But I could make it something like:
1. Letting ->head grow if ->head_free busy
2. If head_free is busy, then just queue/requeue the monitor to try again.
This would even improve performance, but will still risk going out of memory.
Thoughts?
thanks,
- Joel
>
> This way, we can avoid both:
>
> 1. too many requests being queued and
> 2. __call_rcu() bunch of requests within a single kfree_rcu().
>
> Thanks,
> Byungchul
>
> >
> > But please feel free to come up with a better solution!
> >
> > [ . . . ]
On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
>
> On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > [ . . . ]
> > > > > > + for (; head; head = next) {
> > > > > > + next = head->next;
> > > > > > + head->next = NULL;
> > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > >
> > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > are not going to be at all happy with this loop.
> > > >
> > > > Ok, will add this here.
> > > >
> > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > need to become greater-or-equal comparisons or some such.
> > > >
> > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > then add more tests to validate the improvements.
> > >
> > > Out of pity for people bisecting, we need this fixed up front.
> > >
> > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > available. That way you are looping with interrupts and preemption
> > > enabled in workqueue context, which is much less damaging than doing so
> > > with interrupts disabled, and possibly even from hard-irq context.
> >
> > Agree.
> >
> > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > KFREE_MAX_BATCH):
> >
> > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> >
> > On success: Same as now.
> > On fail: let ->head grow and drain if possible, until reaching to
> > KFREE_MAX_BATCH_FORCE.
I should've explained this in more detail. This actually means:
On fail: Let ->head grow and queue the rcu_work when ->head_free == NULL,
until reaching _FORCE.
> > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > one from now on to prevent too many pending requests from being
> > queued for batching work.
This means:
3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests added from
   now on and instead handle them one by one, to prevent too many pending
   requests from being queued. Of course, the requests already queued in
   ->head so far should still be handled by the rcu_work when possible,
   which can be checked by the monitor or by kfree_rcu() on every call.
> I also agree. But this _FORCE thing will still not solve the issue Paul is
> raising which is doing this loop possibly in irq disabled / hardirq context.
I added more explanation above. What I suggested is a way to avoid not only
heavy work within the irq-disabled region of a single kfree_rcu(), but also
too many requests being queued onto ->head.
> We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> even worse. Consider a real-time system with a lot of memory, in this case
> letting ->head grow large is Ok, but looping for long time in IRQ disabled
> would not be Ok.
Please check the explanation above.
> But I could make it something like:
> 1. Letting ->head grow if ->head_free busy
> 2. If head_free is busy, then just queue/requeue the monitor to try again.
This is exactly what Paul said. The problem with this is ->head can grow too
much. That's why I suggested the above one.
> This would even improve performance, but will still risk going out of memory.
>
> Thoughts?
>
> thanks,
>
> - Joel
>
> >
> > This way, we can avoid both:
> >
> > 1. too many requests being queued and
> > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> >
> > Thanks,
> > Byungchul
> >
> > >
> > > But please feel free to come up with a better solution!
> > >
> > > [ . . . ]
--
Thanks,
Byungchul
On Thu, Aug 08, 2019 at 11:23:17PM +0900, Byungchul Park wrote:
> On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
> >
> > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > [ . . . ]
> > > > > > > + for (; head; head = next) {
> > > > > > > + next = head->next;
> > > > > > > + head->next = NULL;
> > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > >
> > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > are not going to be at all happy with this loop.
> > > > >
> > > > > Ok, will add this here.
> > > > >
> > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > need to become greater-or-equal comparisons or some such.
> > > > >
> > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > then add more tests to validate the improvements.
> > > >
> > > > Out of pity for people bisecting, we need this fixed up front.
> > > >
> > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > available. That way you are looping with interrupts and preemption
> > > > enabled in workqueue context, which is much less damaging than doing so
> > > > with interrupts disabled, and possibly even from hard-irq context.
> > >
> > > Agree.
> > >
> > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > KFREE_MAX_BATCH):
> > >
> > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > >
> > > On success: Same as now.
> > > On fail: let ->head grow and drain if possible, until reaching to
> > > KFREE_MAX_BATCH_FORCE.
>
> I should've explain this in more detail. This actually mean:
>
> On fail: Let ->head grow and queue rcu_work when ->head_free == NULL,
> until reaching to _FORCE.
>
> > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > one from now on to prevent too many pending requests from being
> > > queued for batching work.
>
> This mean:
>
> 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests to be added
> from now on but instead handle one by one to prevent too many
> pending requests
> from being queued. Of course, the requests already having been
> queued in ->head
> so far should be handled by rcu_work when it's possible which can
> be checked by
> the monitor or kfree_rcu() inside every call.
But does this really help? After all, the reason we have piled up a
large number of additional callbacks is likely because the grace period
is taking a long time, or because a huge number of callbacks has been
queued up. Sure, these callbacks might get a head start on the following
grace period, but at the expense of still retaining the kfree_rcu()
special cases in rcu_do_batch().
Another potential issue is interaction with rcu_barrier(). Currently,
rcu_barrier() waits for memory passed to prior kfree_rcu() calls to be
freed. This is useful to allow a large amount of memory to be completely
freed before allocating large amounts of additional memory. With the
earlier version of the patch, an rcu_barrier() followed by a
flush_workqueue() would do the trick. But #3 above would reorder the
objects, so that this approach might not wait for everything.
We should therefore just let the second list grow. If experience shows
a need for callbacks to be sent up more quickly, it should be possible
to provide an additional list, so that two lists on a given CPU can both
be waiting for a grace period at the same time.
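To make that concrete, a sketch of such an extension might look like the
following. The field names are my own guesses layered on the patch's
kfree_rcu_cpu structure, not code from this thread:

struct kfree_rcu_cpu {
	struct rcu_work rcu_work[2];	/* one worker per in-flight batch */
	struct rcu_head *head;		/* objects still being collected */
	struct rcu_head *head_free[2];	/* detached batches awaiting a GP */
	int kfree_batch_len;
	spinlock_t lock;
	struct delayed_work monitor_work;
	bool monitor_todo;
};

The drain path would then pick whichever head_free[] slot is idle, so a
second batch can be detached while the first is still waiting for its
grace period.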
> > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > raising which is doing this loop possibly in irq disabled / hardirq context.
>
> I added more explanation above. What I suggested is a way to avoid not
> only heavy
> work within the irq-disabled region of a single kfree_rcu() but also
> too many requests
> to be queued into ->head.
But let's start simple, please!
> > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > even worse. Consider a real-time system with a lot of memory, in this case
> > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > would not be Ok.
>
> Please check the explanation above.
>
> > But I could make it something like:
> > 1. Letting ->head grow if ->head_free busy
> > 2. If head_free is busy, then just queue/requeue the monitor to try again.
>
> This is exactly what Paul said. The problem with this is ->head can grow too
> much. That's why I suggested the above one.
It can grow quite large, but how do you know that limiting its size will
really help? Sure, you have limited the size, but does that really do
anything for the larger problem of extreme kfree_rcu() rates on the one
hand and a desire for more efficient handling of kfree_rcu() on the other?
Thanx, Paul
> > This would even improve performance, but will still risk going out of memory.
> >
> > Thoughts?
> >
> > thanks,
> >
> > - Joel
> >
> > >
> > > This way, we can avoid both:
> > >
> > > 1. too many requests being queued and
> > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > >
> > > Thanks,
> > > Byungchul
> > >
> > > >
> > > > But please feel free to come up with a better solution!
> > > >
> > > > [ . . . ]
>
>
>
> --
> Thanks,
> Byungchul
>
On Thu, Aug 08, 2019 at 07:26:10PM +0900, Byungchul Park wrote:
> On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
>
> [snip]
>
> > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > Of course, I am hoping that a later patch uses an array of pointers built
> > > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
> > > in order to reduce per-object cache-miss overhead. This would make it
> > > easier for callback invocation to keep up with multi-CPU kfree_rcu()
> > > floods.
> >
> > I think Byungchul tried an experiment with array of pointers and wasn't
> > immediately able to see a benefit. Perhaps his patch needs a bit more polish
> > or another test-case needed to show benefit due to cache-misses, and the perf
> > tool could be used to show if cache misses were reduced. For this initial
> > pass, we decided to keep it without the array optimization.
>
> I'm still seeing no improvement with kfree_bulk().
>
> I've been thinking I could see improvement with kfree_bulk() because:
>
> 1. As you guys said, the number of cache misses will be reduced.
> 2. We can save (N - 1) irq-disable instructions while N kfrees.
> 3. As Joel said, saving/restoring CPU status that kfree() does inside
> is not required.
>
> But even with the following patch applied, the result was same as just
> batching test. We might need to get kmalloc objects from random
> addresses to maximize the result when using kfree_bulk() and this is
> even closer to real practical world too.
>
> And the second and third reasons doesn't seem to work as much as I
> expected.
>
> Do you have any idea? Or what do you think about it?
I would not expect kfree_batch() to help all that much unless the
pre-grace-period kfree_rcu() code segregated the objects on a per-slab
basis.
Thanx, Paul
> Thanks,
> Byungchul
>
> -----8<-----
> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> index 988e1ae..6f2ab06 100644
> --- a/kernel/rcu/rcuperf.c
> +++ b/kernel/rcu/rcuperf.c
> @@ -651,10 +651,10 @@ struct kfree_obj {
> return -ENOMEM;
> }
>
> - for (i = 0; i < kfree_alloc_num; i++) {
> - if (!kfree_no_batch) {
> - kfree_rcu(alloc_ptrs[i], rh);
> - } else {
> + if (!kfree_no_batch) {
> + kfree_bulk(kfree_alloc_num, (void **)alloc_ptrs);
> + } else {
> + for (i = 0; i < kfree_alloc_num; i++) {
> rcu_callback_t cb;
>
> cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
>
On Thu, Aug 08, 2019 at 11:11:12AM -0700, Paul E. McKenney wrote:
> On Thu, Aug 08, 2019 at 07:26:10PM +0900, Byungchul Park wrote:
> > On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> > > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
> >
> > [snip]
> >
> > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > Of course, I am hoping that a later patch uses an array of pointers built
> > > > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
> > > > in order to reduce per-object cache-miss overhead. This would make it
> > > > easier for callback invocation to keep up with multi-CPU kfree_rcu()
> > > > floods.
> > >
> > > I think Byungchul tried an experiment with array of pointers and wasn't
> > > immediately able to see a benefit. Perhaps his patch needs a bit more polish
> > > or another test-case needed to show benefit due to cache-misses, and the perf
> > > tool could be used to show if cache misses were reduced. For this initial
> > > pass, we decided to keep it without the array optimization.
> >
> > I'm still seeing no improvement with kfree_bulk().
> >
> > I've been thinking I could see improvement with kfree_bulk() because:
> >
> > 1. As you guys said, the number of cache misses will be reduced.
> > 2. We can save (N - 1) irq-disable instructions while N kfrees.
> > 3. As Joel said, saving/restoring CPU status that kfree() does inside
> > is not required.
> >
> > But even with the following patch applied, the result was same as just
> > batching test. We might need to get kmalloc objects from random
> > addresses to maximize the result when using kfree_bulk() and this is
> > even closer to real practical world too.
> >
> > And the second and third reasons doesn't seem to work as much as I
> > expected.
> >
> > Do you have any idea? Or what do you think about it?
>
> I would not expect kfree_batch() to help all that much unless the
> pre-grace-period kfree_rcu() code segregated the objects on a per-slab
> basis.
You mean kfree_bulk() instead of kfree_batch(), right? I agree with you; it
would be nice to do the per-slab optimization in the future.
Also, I am thinking that whenever we do the per-slab optimization,
kmem_cache_free_bulk() can be optimized further. If all pointers are on the
same slab, then we can just do virt_to_cache() on the first pointer and avoid
repeated virt_to_cache() calls. That might also give a benefit -- but I could
be missing something.
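A minimal sketch of that idea, assuming the caller somehow knows all objects
came from the same cache (my illustration only; virt_to_cache() and
__cache_free() are mm/slab internals, so this would have to live in
mm/slab.c, and the function name is made up):

/* Free a batch of objects known to belong to a single slab cache. */
static void kfree_bulk_same_cache(size_t size, void **p)
{
	struct kmem_cache *s;
	size_t i;

	if (!size)
		return;

	/* Resolve the cache once instead of once per object. */
	s = virt_to_cache(p[0]);

	local_irq_disable();
	for (i = 0; i < size; i++)
		__cache_free(s, p[i], _RET_IP_);
	local_irq_enable();
}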
Right now kmem_cache_free_bulk() just looks like kmem_cache_free() in a
loop, except for the small benefit of not disabling/enabling IRQs around
each __cache_free(), and the reduced cache-miss cost of walking an array.
thanks,
- Joel
[snip]
On Thu, Aug 08, 2019 at 04:13:33PM -0400, Joel Fernandes wrote:
> On Thu, Aug 08, 2019 at 11:11:12AM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 08, 2019 at 07:26:10PM +0900, Byungchul Park wrote:
> > > On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote:
> > > > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote:
> > >
> > > [snip]
> > >
> > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > > Of course, I am hoping that a later patch uses an array of pointers built
> > > > > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk)
> > > > > in order to reduce per-object cache-miss overhead. This would make it
> > > > > easier for callback invocation to keep up with multi-CPU kfree_rcu()
> > > > > floods.
> > > >
> > > > I think Byungchul tried an experiment with array of pointers and wasn't
> > > > immediately able to see a benefit. Perhaps his patch needs a bit more polish
> > > > or another test-case needed to show benefit due to cache-misses, and the perf
> > > > tool could be used to show if cache misses were reduced. For this initial
> > > > pass, we decided to keep it without the array optimization.
> > >
> > > I'm still seeing no improvement with kfree_bulk().
> > >
> > > I've been thinking I could see improvement with kfree_bulk() because:
> > >
> > > 1. As you guys said, the number of cache misses will be reduced.
> > > 2. We can save (N - 1) irq-disable instructions while N kfrees.
> > > 3. As Joel said, saving/restoring CPU status that kfree() does inside
> > > is not required.
> > >
> > > But even with the following patch applied, the result was same as just
> > > batching test. We might need to get kmalloc objects from random
> > > addresses to maximize the result when using kfree_bulk() and this is
> > > even closer to real practical world too.
> > >
> > > And the second and third reasons doesn't seem to work as much as I
> > > expected.
> > >
> > > Do you have any idea? Or what do you think about it?
> >
> > I would not expect kfree_batch() to help all that much unless the
> > pre-grace-period kfree_rcu() code segregated the objects on a per-slab
> > basis.
>
> You mean kfree_bulk() instead of kfree_batch() right? I agree with you, would
> be nice to do per-slab optimization in the future.
Indeed I do mean kfree_bulk()! One of those mornings, I guess...
But again, without the per-slab locality, I doubt that we will see much
improvement from kfree_bulk() over kfree().
> Also, I am thinking that whenever we do per-slab optimization, then the
> kmem_cache_free_bulk() can be optimized further. If all pointers are on the
> same slab, then we can just do virt_to_cache on the first pointer and avoid
> repeated virt_to_cache() calls. That might also give a benefit -- but I could
> be missing something.
A sort might be required to make that work nicely, which would add some
overhead. Probably not that much, though, the increased locality would
have a fighting chance of overcoming the sort's overhead.
> Right now kmem_cache_free_bulk() just looks like a kmem_cache_free() in a
> loop except the small benefit of not disabling/enabling IRQs across each
> __cache_free, and the reduced cache miss benefit of using the array.
C'mon! Show some respect for the awesome power of temporal locality!!! ;-)
Thanx, Paul
On Thu, Aug 08, 2019 at 01:51:29PM -0700, Paul E. McKenney wrote:
[snip]
> > Also, I am thinking that whenever we do per-slab optimization, then the
> > kmem_cache_free_bulk() can be optimized further. If all pointers are on the
> > same slab, then we can just do virt_to_cache on the first pointer and avoid
> > repeated virt_to_cache() calls. That might also give a benefit -- but I could
> > be missing something.
>
> A sort might be required to make that work nicely, which would add some
> overhead. Probably not that much, though, the increased locality would
> have a fighting chance of overcoming the sort's overhead.
>
> > Right now kmem_cache_free_bulk() just looks like a kmem_cache_free() in a
> > loop except the small benefit of not disabling/enabling IRQs across each
> > __cache_free, and the reduced cache miss benefit of using the array.
>
> C'mon! Show some respect for the awesome power of temporal locality!!! ;-)
Good point. I will try to respect it more in the future ;-) After all, it is
quite a useful concept.
thanks,
- Joel
On Thu, Aug 08, 2019 at 06:34:15PM -0400, Joel Fernandes wrote:
> On Thu, Aug 08, 2019 at 01:51:29PM -0700, Paul E. McKenney wrote:
> [snip]
> > > Also, I am thinking that whenever we do per-slab optimization, then the
> > > kmem_cache_free_bulk() can be optimized further. If all pointers are on the
> > > same slab, then we can just do virt_to_cache on the first pointer and avoid
> > > repeated virt_to_cache() calls. That might also give a benefit -- but I could
> > > be missing something.
> >
> > A sort might be required to make that work nicely, which would add some
> > overhead. Probably not that much, though, the increased locality would
> > have a fighting chance of overcoming the sort's overhead.
> >
> > > Right now kmem_cache_free_bulk() just looks like a kmem_cache_free() in a
> > > loop except the small benefit of not disabling/enabling IRQs across each
> > > __cache_free, and the reduced cache miss benefit of using the array.
> >
> > C'mon! Show some respect for the awesome power of temporal locality!!! ;-)
>
> Good point. I will try to respect it more in the future ;-) After all, it is
> quite a useful concept.
;-) ;-) ;-)
It still has to prove itself in real benchmarks, of course!
Thanx, Paul
On Thu, Aug 08, 2019 at 08:56:07AM -0400, Joel Fernandes wrote:
> On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > [ . . . ]
> > > > > > + for (; head; head = next) {
> > > > > > + next = head->next;
> > > > > > + head->next = NULL;
> > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > >
> > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > are not going to be at all happy with this loop.
> > > >
> > > > Ok, will add this here.
> > > >
> > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > need to become greater-or-equal comparisons or some such.
> > > >
> > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > then add more tests to validate the improvements.
> > >
> > > Out of pity for people bisecting, we need this fixed up front.
> > >
> > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > available. That way you are looping with interrupts and preemption
> > > enabled in workqueue context, which is much less damaging than doing so
> > > with interrupts disabled, and possibly even from hard-irq context.
> >
> > Agree.
> >
> > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > KFREE_MAX_BATCH):
> >
> > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> >
> > On success: Same as now.
> > On fail: let ->head grow and drain if possible, until reaching to
> > KFREE_MAX_BATCH_FORCE.
> >
> > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > one from now on to prevent too many pending requests from being
> > queued for batching work.
>
> I also agree. But this _FORCE thing will still not solve the issue Paul is
> raising which is doing this loop possibly in irq disabled / hardirq context.
> We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> even worse. Consider a real-time system with a lot of memory, in this case
> letting ->head grow large is Ok, but looping for long time in IRQ disabled
> would not be Ok.
>
> But I could make it something like:
> 1. Letting ->head grow if ->head_free busy
> 2. If head_free is busy, then just queue/requeue the monitor to try again.
>
> This would even improve performance, but will still risk going out of memory.
It seems I can indeed hit an out of memory condition once I changed it to
"letting list grow" (diff is below which applies on top of this patch) while
at the same time removing the schedule_timeout(2) and replacing it with
cond_resched() in the rcuperf test. I think the reason is the rcuperf test
starves the worker threads that are executing in workqueue context after a
grace period and those are unable to get enough CPU time to kfree things fast
enough. But I am not fully sure about it and need to test/trace more to
figure out why this is happening.
If I add back the schedule_timeout_uninterruptible(2) call, the out-of-memory
situation goes away.
Clearly we need to do more work on this patch.
In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
that since the kfree is happening in softirq context in the _no_batch() case,
it fares better. The question then I guess is how do we run the rcu_work in a
higher priority context so it is not starved and runs often enough. I'll
trace more.
Perhaps I can also lower the priority of the rcuperf threads to give the
worker thread some more room to run and see if anything changes. But I am not
sure then if we're preparing the code for the real world with such
modifications.
Any thoughts?
thanks,
- Joel
---8<-----------------------
From 098d62e5a1b84a11139236c9b1f59e7f32289b40 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <[email protected]>
Date: Thu, 8 Aug 2019 16:29:58 -0400
Subject: [PATCH] Let list grow
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/rcu/rcuperf.c | 2 +-
kernel/rcu/tree.c | 52 +++++++++++++++++++-------------------------
2 files changed, 23 insertions(+), 31 deletions(-)
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 34658760da5e..7dc831db89ae 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -654,7 +654,7 @@ kfree_perf_thread(void *arg)
}
}
- schedule_timeout_uninterruptible(2);
+ cond_resched();
} while (!torture_must_stop() && ++l < kfree_loops);
kfree(alloc_ptrs);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bdbd483606ce..bab77220d8ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2595,7 +2595,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining batch */
-#define KFREE_DRAIN_JIFFIES 50
+#define KFREE_DRAIN_JIFFIES (HZ / 20)
/*
* Maximum number of kfree(s) to batch, if limit is hit
@@ -2684,27 +2684,19 @@ static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
{
struct rcu_head *head, *next;
- /* It is time to do bulk reclaim after grace period */
- krc->monitor_todo = false;
+ /* It is time to do bulk reclaim after grace period. */
if (queue_kfree_rcu_work(krc)) {
spin_unlock_irqrestore(&krc->lock, flags);
return;
}
- /*
- * Use non-batch regular call_rcu for kfree_rcu in case things are too
- * busy and batching of kfree_rcu could not be used.
- */
- head = krc->head;
- krc->head = NULL;
- krc->kfree_batch_len = 0;
- spin_unlock_irqrestore(&krc->lock, flags);
-
- for (; head; head = next) {
- next = head->next;
- head->next = NULL;
- __call_rcu(head, head->func, -1, 1);
+ /* Previous batch did not get free yet, let us try again soon. */
+ if (krc->monitor_todo == false) {
+ schedule_delayed_work_on(smp_processor_id(),
+ &krc->monitor_work, KFREE_DRAIN_JIFFIES/4);
+ krc->monitor_todo = true;
}
+ spin_unlock_irqrestore(&krc->lock, flags);
}
/*
--
2.23.0.rc1.153.gdeed80330f-goog
On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> On Thu, Aug 08, 2019 at 08:56:07AM -0400, Joel Fernandes wrote:
> > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > [ . . . ]
> > > > > > > + for (; head; head = next) {
> > > > > > > + next = head->next;
> > > > > > > + head->next = NULL;
> > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > >
> > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > are not going to be at all happy with this loop.
> > > > >
> > > > > Ok, will add this here.
> > > > >
> > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > need to become greater-or-equal comparisons or some such.
> > > > >
> > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > then add more tests to validate the improvements.
> > > >
> > > > Out of pity for people bisecting, we need this fixed up front.
> > > >
> > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > available. That way you are looping with interrupts and preemption
> > > > enabled in workqueue context, which is much less damaging than doing so
> > > > with interrupts disabled, and possibly even from hard-irq context.
> > >
> > > Agree.
> > >
> > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > KFREE_MAX_BATCH):
> > >
> > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > >
> > > On success: Same as now.
> > > On fail: let ->head grow and drain if possible, until reaching to
> > > KFREE_MAX_BATCH_FORCE.
> > >
> > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > one from now on to prevent too many pending requests from being
> > > queued for batching work.
> >
> > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > raising which is doing this loop possibly in irq disabled / hardirq context.
> > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > even worse. Consider a real-time system with a lot of memory, in this case
> > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > would not be Ok.
> >
> > But I could make it something like:
> > 1. Letting ->head grow if ->head_free busy
> > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> >
> > This would even improve performance, but will still risk going out of memory.
>
> It seems I can indeed hit an out of memory condition once I changed it to
> "letting list grow" (diff is below which applies on top of this patch) while
> at the same time removing the schedule_timeout(2) and replacing it with
> cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> starves the worker threads that are executing in workqueue context after a
> grace period and those are unable to get enough CPU time to kfree things fast
> enough. But I am not fully sure about it and need to test/trace more to
> figure out why this is happening.
>
> If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> situation goes away.
>
> Clearly we need to do more work on this patch.
>
> In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> that since the kfree is happening in softirq context in the _no_batch() case,
> it fares better. The question then I guess is how do we run the rcu_work in a
> higher priority context so it is not starved and runs often enough. I'll
> trace more.
>
> Perhaps I can also lower the priority of the rcuperf threads to give the
> worker thread some more room to run and see if anything changes. But I am not
> sure then if we're preparing the code for the real world with such
> modifications.
>
> Any thoughts?
Several! With luck, perhaps some are useful. ;-)
o Increase the memory via kvm.sh "--memory 1G" or more. The
default is "--memory 500M".
o Leave a CPU free to run things like the RCU grace-period kthread.
You might also need to bind that kthread to that CPU.
o Alternatively, use the "rcutree.kthread_prio=" boot parameter to
boost the RCU kthreads to real-time priority. This won't do
anything for ksoftirqd, though.
o Along with the above boot parameter, use "rcutree.use_softirq=0"
to cause RCU to use kthreads instead of softirq. (You might well
find issues in priority setting as well, but might as well find
them now if so!)
o With any of the above, invoke rcu_momentary_dyntick_idle() along
with cond_resched() in your kfree_rcu() loop. This simulates
a trip to userspace for nohz_full CPUs, so if this helps for
non-nohz_full CPUs, adjustments to the kernel might be called for.
Probably others, but this should do for a start.
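For the last bullet, the adjusted test loop might look something like the
sketch below. This is my own illustration of the suggestion, not code from
this thread, using "loops" in place of the "l" counter discussed earlier:

	do {
		for (i = 0; i < kfree_alloc_num; i++) {
			alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
			if (!alloc_ptrs[i])
				return -ENOMEM;
		}

		for (i = 0; i < kfree_alloc_num; i++)
			kfree_rcu(alloc_ptrs[i], rh);

		cond_resched();			/* let other tasks run */
		rcu_momentary_dyntick_idle();	/* mimic a brief userspace visit */
	} while (!torture_must_stop() && ++loops < kfree_loops);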
Thanx, Paul
> thanks,
>
> - Joel
>
> ---8<-----------------------
>
> >From 098d62e5a1b84a11139236c9b1f59e7f32289b40 Mon Sep 17 00:00:00 2001
> From: "Joel Fernandes (Google)" <[email protected]>
> Date: Thu, 8 Aug 2019 16:29:58 -0400
> Subject: [PATCH] Let list grow
>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> kernel/rcu/rcuperf.c | 2 +-
> kernel/rcu/tree.c | 52 +++++++++++++++++++-------------------------
> 2 files changed, 23 insertions(+), 31 deletions(-)
>
> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> index 34658760da5e..7dc831db89ae 100644
> --- a/kernel/rcu/rcuperf.c
> +++ b/kernel/rcu/rcuperf.c
> @@ -654,7 +654,7 @@ kfree_perf_thread(void *arg)
> }
> }
>
> - schedule_timeout_uninterruptible(2);
> + cond_resched();
> } while (!torture_must_stop() && ++l < kfree_loops);
>
> kfree(alloc_ptrs);
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index bdbd483606ce..bab77220d8ac 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2595,7 +2595,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
>
>
> /* Maximum number of jiffies to wait before draining batch */
> -#define KFREE_DRAIN_JIFFIES 50
> +#define KFREE_DRAIN_JIFFIES (HZ / 20)
>
> /*
> * Maximum number of kfree(s) to batch, if limit is hit
> @@ -2684,27 +2684,19 @@ static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
> {
> struct rcu_head *head, *next;
>
> - /* It is time to do bulk reclaim after grace period */
> - krc->monitor_todo = false;
> + /* It is time to do bulk reclaim after grace period. */
> if (queue_kfree_rcu_work(krc)) {
> spin_unlock_irqrestore(&krc->lock, flags);
> return;
> }
>
> - /*
> - * Use non-batch regular call_rcu for kfree_rcu in case things are too
> - * busy and batching of kfree_rcu could not be used.
> - */
> - head = krc->head;
> - krc->head = NULL;
> - krc->kfree_batch_len = 0;
> - spin_unlock_irqrestore(&krc->lock, flags);
> -
> - for (; head; head = next) {
> - next = head->next;
> - head->next = NULL;
> - __call_rcu(head, head->func, -1, 1);
> + /* Previous batch did not get free yet, let us try again soon. */
> + if (krc->monitor_todo == false) {
> + schedule_delayed_work_on(smp_processor_id(),
> + &krc->monitor_work, KFREE_DRAIN_JIFFIES/4);
> + krc->monitor_todo = true;
> }
> + spin_unlock_irqrestore(&krc->lock, flags);
> }
>
> /*
> --
> 2.23.0.rc1.153.gdeed80330f-goog
>
On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
[snip]
> > > But I could make it something like:
> > > 1. Letting ->head grow if ->head_free busy
> > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > >
> > > This would even improve performance, but will still risk going out of memory.
> >
> > It seems I can indeed hit an out of memory condition once I changed it to
> > "letting list grow" (diff is below which applies on top of this patch) while
> > at the same time removing the schedule_timeout(2) and replacing it with
> > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > starves the worker threads that are executing in workqueue context after a
> > grace period and those are unable to get enough CPU time to kfree things fast
> > enough. But I am not fully sure about it and need to test/trace more to
> > figure out why this is happening.
> >
> > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > situation goes away.
> >
> > Clearly we need to do more work on this patch.
> >
> > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > that since the kfree is happening in softirq context in the _no_batch() case,
> > it fares better. The question then I guess is how do we run the rcu_work in a
> > higher priority context so it is not starved and runs often enough. I'll
> > trace more.
> >
> > Perhaps I can also lower the priority of the rcuperf threads to give the
> > worker thread some more room to run and see if anything changes. But I am not
> > sure then if we're preparing the code for the real world with such
> > modifications.
> >
> > Any thoughts?
>
> Several! With luck, perhaps some are useful. ;-)
>
> o Increase the memory via kvm.sh "--memory 1G" or more. The
> default is "--memory 500M".
Thanks, this definitely helped.
> o Leave a CPU free to run things like the RCU grace-period kthread.
> You might also need to bind that kthread to that CPU.
>
> o Alternatively, use the "rcutree.kthread_prio=" boot parameter to
> boost the RCU kthreads to real-time priority. This won't do
> anything for ksoftirqd, though.
I will try these as well.
>
> o Along with the above boot parameter, use "rcutree.use_softirq=0"
> to cause RCU to use kthreads instead of softirq. (You might well
> find issues in priority setting as well, but might as well find
> them now if so!)
Doesn't this one actually reduce the priority of the core RCU work? softirq
will always have higher priority than any kthread there. So wouldn't that
have the effect of not reclaiming things fast enough? (Or, in my case, not
scheduling the rcu_work which does the reclaim.)
> o With any of the above, invoke rcu_momentary_dyntick_idle() along
> with cond_resched() in your kfree_rcu() loop. This simulates
> a trip to userspace for nohz_full CPUs, so if this helps for
> non-nohz_full CPUs, adjustments to the kernel might be called for.
Ok, will try it.
Save these bullet points for future reference! ;-) thanks,
- Joel
>
> Probably others, but this should do for a start.
>
> Thanx, Paul
>
> > thanks,
> >
> > - Joel
> >
> > ---8<-----------------------
> >
> > >From 098d62e5a1b84a11139236c9b1f59e7f32289b40 Mon Sep 17 00:00:00 2001
> > From: "Joel Fernandes (Google)" <[email protected]>
> > Date: Thu, 8 Aug 2019 16:29:58 -0400
> > Subject: [PATCH] Let list grow
> >
> > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > ---
> > kernel/rcu/rcuperf.c | 2 +-
> > kernel/rcu/tree.c | 52 +++++++++++++++++++-------------------------
> > 2 files changed, 23 insertions(+), 31 deletions(-)
> >
> > diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> > index 34658760da5e..7dc831db89ae 100644
> > --- a/kernel/rcu/rcuperf.c
> > +++ b/kernel/rcu/rcuperf.c
> > @@ -654,7 +654,7 @@ kfree_perf_thread(void *arg)
> > }
> > }
> >
> > - schedule_timeout_uninterruptible(2);
> > + cond_resched();
> > } while (!torture_must_stop() && ++l < kfree_loops);
> >
> > kfree(alloc_ptrs);
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index bdbd483606ce..bab77220d8ac 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2595,7 +2595,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
> >
> >
> > /* Maximum number of jiffies to wait before draining batch */
> > -#define KFREE_DRAIN_JIFFIES 50
> > +#define KFREE_DRAIN_JIFFIES (HZ / 20)
> >
> > /*
> > * Maximum number of kfree(s) to batch, if limit is hit
> > @@ -2684,27 +2684,19 @@ static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
> > {
> > struct rcu_head *head, *next;
> >
> > - /* It is time to do bulk reclaim after grace period */
> > - krc->monitor_todo = false;
> > + /* It is time to do bulk reclaim after grace period. */
> > if (queue_kfree_rcu_work(krc)) {
> > spin_unlock_irqrestore(&krc->lock, flags);
> > return;
> > }
> >
> > - /*
> > - * Use non-batch regular call_rcu for kfree_rcu in case things are too
> > - * busy and batching of kfree_rcu could not be used.
> > - */
> > - head = krc->head;
> > - krc->head = NULL;
> > - krc->kfree_batch_len = 0;
> > - spin_unlock_irqrestore(&krc->lock, flags);
> > -
> > - for (; head; head = next) {
> > - next = head->next;
> > - head->next = NULL;
> > - __call_rcu(head, head->func, -1, 1);
> > + /* Previous batch did not get free yet, let us try again soon. */
> > + if (krc->monitor_todo == false) {
> > + schedule_delayed_work_on(smp_processor_id(),
> > + &krc->monitor_work, KFREE_DRAIN_JIFFIES/4);
> > + krc->monitor_todo = true;
> > }
> > + spin_unlock_irqrestore(&krc->lock, flags);
> > }
> >
> > /*
> > --
> > 2.23.0.rc1.153.gdeed80330f-goog
> >
>
On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> [snip]
> > > > But I could make it something like:
> > > > 1. Letting ->head grow if ->head_free busy
> > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > >
> > > > This would even improve performance, but will still risk going out of memory.
> > >
> > > It seems I can indeed hit an out of memory condition once I changed it to
> > > "letting list grow" (diff is below which applies on top of this patch) while
> > > at the same time removing the schedule_timeout(2) and replacing it with
> > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > starves the worker threads that are executing in workqueue context after a
> > > grace period and those are unable to get enough CPU time to kfree things fast
> > > enough. But I am not fully sure about it and need to test/trace more to
> > > figure out why this is happening.
> > >
> > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > situation goes away.
> > >
> > > Clearly we need to do more work on this patch.
> > >
> > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > higher priority context so it is not starved and runs often enough. I'll
> > > trace more.
> > >
> > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > worker thread some more room to run and see if anything changes. But I am not
> > > sure then if we're preparing the code for the real world with such
> > > modifications.
> > >
> > > Any thoughts?
> >
> > Several! With luck, perhaps some are useful. ;-)
> >
> > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > default is "--memory 500M".
>
> Thanks, this definitely helped.
>
> > o Leave a CPU free to run things like the RCU grace-period kthread.
> > You might also need to bind that kthread to that CPU.
> >
> > o Alternatively, use the "rcutree.kthread_prio=" boot parameter to
> > boost the RCU kthreads to real-time priority. This won't do
> > anything for ksoftirqd, though.
>
> I will try these as well.
>
> >
> > o Along with the above boot parameter, use "rcutree.use_softirq=0"
> > to cause RCU to use kthreads instead of softirq. (You might well
> > find issues in priority setting as well, but might as well find
> > them now if so!)
>
> Doesn't think one actually reduce the priority of the core RCU work? softirq
> will always have higher priority than any there. So wouldn't that have the
> effect of not reclaiming things fast enough? (Or, in my case not scheduling
> the rcu_work which does the reclaim).
For low kfree_rcu() loads, yes, it increases overhead due to the need
for context switches instead of softirq running at the tail end of an
interrupt. But for high kfree_rcu() loads, it gets you realtime priority
(in conjunction with "rcutree.kthread_prio=", that is).
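(Concretely, that would mean booting the runs with something like

    rcutree.use_softirq=0 rcutree.kthread_prio=50

where the priority value is just illustrative. RCU's callback invocation
then moves out of softirq and into per-CPU kthreads running at SCHED_FIFO
priority 50, instead of ksoftirqd competing at SCHED_OTHER priority with
the flooding tasks.)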
> > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > with cond_resched() in your kfree_rcu() loop. This simulates
> > a trip to userspace for nohz_full CPUs, so if this helps for
> > non-nohz_full CPUs, adjustments to the kernel might be called for.
>
> Ok, will try it.
>
> Save these bullet points for future reference! ;-) thanks,
I guess this is helping me to prepare for Plumbers. ;-)
Thanx, Paul
> - Joel
>
>
> >
> > Probably others, but this should do for a start.
> >
> > Thanx, Paul
> >
> > > thanks,
> > >
> > > - Joel
> > >
> > > ---8<-----------------------
> > >
> > > From 098d62e5a1b84a11139236c9b1f59e7f32289b40 Mon Sep 17 00:00:00 2001
> > > From: "Joel Fernandes (Google)" <[email protected]>
> > > Date: Thu, 8 Aug 2019 16:29:58 -0400
> > > Subject: [PATCH] Let list grow
> > >
> > > Signed-off-by: Joel Fernandes (Google) <[email protected]>
> > > ---
> > > kernel/rcu/rcuperf.c | 2 +-
> > > kernel/rcu/tree.c | 52 +++++++++++++++++++-------------------------
> > > 2 files changed, 23 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
> > > index 34658760da5e..7dc831db89ae 100644
> > > --- a/kernel/rcu/rcuperf.c
> > > +++ b/kernel/rcu/rcuperf.c
> > > @@ -654,7 +654,7 @@ kfree_perf_thread(void *arg)
> > > }
> > > }
> > >
> > > - schedule_timeout_uninterruptible(2);
> > > + cond_resched();
> > > } while (!torture_must_stop() && ++l < kfree_loops);
> > >
> > > kfree(alloc_ptrs);
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index bdbd483606ce..bab77220d8ac 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -2595,7 +2595,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
> > >
> > >
> > > /* Maximum number of jiffies to wait before draining batch */
> > > -#define KFREE_DRAIN_JIFFIES 50
> > > +#define KFREE_DRAIN_JIFFIES (HZ / 20)
> > >
> > > /*
> > > * Maximum number of kfree(s) to batch, if limit is hit
> > > @@ -2684,27 +2684,19 @@ static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
> > > {
> > > struct rcu_head *head, *next;
> > >
> > > - /* It is time to do bulk reclaim after grace period */
> > > - krc->monitor_todo = false;
> > > + /* It is time to do bulk reclaim after grace period. */
> > > if (queue_kfree_rcu_work(krc)) {
> > > spin_unlock_irqrestore(&krc->lock, flags);
> > > return;
> > > }
> > >
> > > - /*
> > > - * Use non-batch regular call_rcu for kfree_rcu in case things are too
> > > - * busy and batching of kfree_rcu could not be used.
> > > - */
> > > - head = krc->head;
> > > - krc->head = NULL;
> > > - krc->kfree_batch_len = 0;
> > > - spin_unlock_irqrestore(&krc->lock, flags);
> > > -
> > > - for (; head; head = next) {
> > > - next = head->next;
> > > - head->next = NULL;
> > > - __call_rcu(head, head->func, -1, 1);
> > > + /* Previous batch did not get free yet, let us try again soon. */
> > > + if (krc->monitor_todo == false) {
> > > + schedule_delayed_work_on(smp_processor_id(),
> > > + &krc->monitor_work, KFREE_DRAIN_JIFFIES/4);
> > > + krc->monitor_todo = true;
> > > }
> > > + spin_unlock_irqrestore(&krc->lock, flags);
> > > }
> > >
> > > /*
> > > --
> > > 2.23.0.rc1.153.gdeed80330f-goog
> > >
> >
>
On Wed, Aug 07, 2019 at 10:56:57AM -0700, Paul E. McKenney wrote:
> On Wed, Aug 07, 2019 at 06:22:13AM -0400, Joel Fernandes wrote:
> > On Tue, Aug 06, 2019 at 05:29:15PM -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
> > > > This test runs kfree_rcu in a loop to measure performance of the new
> > > > kfree_rcu, with and without patch.
> > > >
> > > > To see improvement, run with boot parameters:
> > > > rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
> > > >
> > > > Without patch, test runs in 6.9 seconds.
> > > > With patch, test runs in 6.1 seconds (+13% improvement)
> > > >
> > > > If it is desired to run the test but with the traditional (non-batched)
> > > > kfree_rcu, for example to compare results, then you could pass along the
> > > > rcuperf.kfree_no_batch=1 boot parameter.
> > >
> > > You lost me on this one. You ran two runs, with rcuperf.kfree_no_batch=1
> > > and without? Or you ran this patch both with and without the earlier
> > > patch, and could have run with the patch and rcuperf.kfree_no_batch=1?
> >
> > I always run the rcutorture test with patch because the patch doesn't really
> > do anything if rcuperf.kfree_no_batch=0. This parameter is added so that in
> > the future folks can compare effect of non-batching with that of the
> > batching. However, I can also remove the patch itself and run this test
> > again.
> >
> > > If the latter, it would be good to try all three.
> >
> > Ok, sure.
>
> Very good! And please make the commit log more clear. ;-)
Sure will do :)
> > > > + long me = (long)arg;
> > > > + struct kfree_obj **alloc_ptrs;
> > > > + u64 start_time, end_time;
> > > > +
> > > > + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
> > > > + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
> > > > + set_user_nice(current, MAX_NICE);
> > > > + atomic_inc(&n_kfree_perf_thread_started);
> > > > +
> > > > + alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
> > > > + GFP_KERNEL);
> > > > + if (!alloc_ptrs)
> > > > + return -ENOMEM;
> > > > +
> > > > + start_time = ktime_get_mono_fast_ns();
> > >
> > > Don't you want to announce that you started here rather than above in
> > > order to avoid (admittedly slight) measurement inaccuracies?
> >
> > I did not follow, are you referring to the measurement inaccuracy related to
> > the "kfree_perf_thread task started" string print? Or, are you saying that
> > ktime_get_mono_fast_ns() has to start earlier than over here?
>
> I am referring to the atomic_inc().
Oh yes, great catch. I will increment closer to the test's actual start.
thanks!
> > (I will reply to the rest of the comments below in a bit, I am going to a
> > hospital now to visit a sick relative and will be back a bit later.)
>
> Ouch!!! I hope that goes as well as it possibly can! And please don't
> neglect your relative on RCU's account!!!
Thanks! It went quite well and now I am back to work ;-)
- Joel
On Fri, Aug 09, 2019 at 09:33:46AM -0700, Paul E. McKenney wrote:
> On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> > On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> > [snip]
> > > > > But I could make it something like:
> > > > > 1. Letting ->head grow if ->head_free busy
> > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > >
> > > > > This would even improve performance, but will still risk going out of memory.
> > > >
> > > > It seems I can indeed hit an out of memory condition once I changed it to
> > > > "letting list grow" (diff is below which applies on top of this patch) while
> > > > at the same time removing the schedule_timeout(2) and replacing it with
> > > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > > starves the worker threads that are executing in workqueue context after a
> > > > grace period and those are unable to get enough CPU time to kfree things fast
> > > > enough. But I am not fully sure about it and need to test/trace more to
> > > > figure out why this is happening.
> > > >
> > > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > > situation goes away.
> > > >
> > > > Clearly we need to do more work on this patch.
> > > >
> > > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > > higher priority context so it is not starved and runs often enough. I'll
> > > > trace more.
> > > >
> > > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > > worker thread some more room to run and see if anything changes. But I am not
> > > > sure then if we're preparing the code for the real world with such
> > > > modifications.
> > > >
> > > > Any thoughts?
> > >
> > > Several! With luck, perhaps some are useful. ;-)
> > >
> > > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > > default is "--memory 500M".
> >
> > Thanks, this definitely helped.
Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
am quite happy about that. I think I can declare that the "let list grow
indefinitely" design works quite well even with an insanely heavily loaded
case of every CPU in a 16CPU system with 500M memory, indefinitely doing
kfree_rcu() in a tight loop with appropriate cond_resched(). And I am like
thinking - wow how does this stuff even work at such insane scales :-D
> > > o Leave a CPU free to run things like the RCU grace-period kthread.
> > > You might also need to bind that kthread to that CPU.
> > >
> > > o Alternatively, use the "rcutree.kthread_prio=" boot parameter to
> > > boost the RCU kthreads to real-time priority. This won't do
> > > anything for ksoftirqd, though.
> >
> > I will try these as well.
> >
kthread_prio=50 definitely reduced the probability of OOM but it still
occurred.
> > > o Along with the above boot parameter, use "rcutree.use_softirq=0"
> > > to cause RCU to use kthreads instead of softirq. (You might well
> > > find issues in priority setting as well, but might as well find
> > > them now if so!)
> >
> > Doesn't think one actually reduce the priority of the core RCU work? softirq
> > will always have higher priority than any there. So wouldn't that have the
> > effect of not reclaiming things fast enough? (Or, in my case not scheduling
> > the rcu_work which does the reclaim).
>
> For low kfree_rcu() loads, yes, it increases overhead due to the need
> for context switches instead of softirq running at the tail end of an
> interrupt. But for high kfree_rcu() loads, it gets you realtime priority
> (in conjunction with "rcutree.kthread_prio=", that is).
I meant that for high kfree_rcu() loads, a softirq context executing RCU
callbacks is still better from the point of view of the callbacks running,
because softirq runs above all else (higher than the highest-priority task).
So use_softirq=0 would be a downgrade from that perspective whenever something
with priority higher than rcutree.kthread_prio is running on the CPU. Unless
kthread_prio is set to the highest prio, softirq would work better. Did I miss
something?
> > > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > > with cond_resched() in your kfree_rcu() loop. This simulates
> > > a trip to userspace for nohz_full CPUs, so if this helps for
> > > non-nohz_full CPUs, adjustments to the kernel might be called for.
I did not try this yet. But I am thinking why would this help in nohz_idle
case? In nohz_idle we already have the tick active when CPU is idle. I guess
it is because there may be a long time that elapses before
rcu_data.rcu_need_heavy_qs == true ?
> > Ok, will try it.
> >
> > Save these bullet points for future reference! ;-) thanks,
>
> I guess this is helping me to prepare for Plumbers. ;-)
:-)
thanks, Paul!
- Joel
On Fri, Aug 09, 2019 at 04:22:26PM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 09:33:46AM -0700, Paul E. McKenney wrote:
> > On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> > > On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> > > [snip]
> > > > > > But I could make it something like:
> > > > > > 1. Letting ->head grow if ->head_free busy
> > > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > > >
> > > > > > This would even improve performance, but will still risk going out of memory.
> > > > >
> > > > > It seems I can indeed hit an out of memory condition once I changed it to
> > > > > "letting list grow" (diff is below which applies on top of this patch) while
> > > > > at the same time removing the schedule_timeout(2) and replacing it with
> > > > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > > > starves the worker threads that are executing in workqueue context after a
> > > > > grace period and those are unable to get enough CPU time to kfree things fast
> > > > > enough. But I am not fully sure about it and need to test/trace more to
> > > > > figure out why this is happening.
> > > > >
> > > > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > > > situation goes away.
> > > > >
> > > > > Clearly we need to do more work on this patch.
> > > > >
> > > > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > > > higher priority context so it is not starved and runs often enough. I'll
> > > > > trace more.
> > > > >
> > > > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > > > worker thread some more room to run and see if anything changes. But I am not
> > > > > sure then if we're preparing the code for the real world with such
> > > > > modifications.
> > > > >
> > > > > Any thoughts?
> > > >
> > > > Several! With luck, perhaps some are useful. ;-)
> > > >
> > > > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > > > default is "--memory 500M".
> > >
> > > Thanks, this definitely helped.
>
> Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> am quite happy about that. I think I can declare that the "let list grow
> indefinitely" design works quite well even with an insanely heavily loaded
> case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> thinking - wow how does this stuff even work at such insane scales :-D
Oh, and I should probably also check whether there is any reduction in the
total number of grace periods, due to the batching!
thanks,
- Joel
On Fri, Aug 09, 2019 at 04:22:26PM -0400, Joel Fernandes wrote:
> > > > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > > > with cond_resched() in your kfree_rcu() loop. This simulates
> > > > a trip to userspace for nohz_full CPUs, so if this helps for
> > > > non-nohz_full CPUs, adjustments to the kernel might be called for.
>
> I did not try this yet. But I am thinking why would this help in nohz_idle
> case? In nohz_idle we already have the tick active when CPU is idle. I guess
> it is because there may be a long time that elapses before
> rcu_data.rcu_need_heavy_qs == true ?
Sorry, here I meant 'tick active when CPU is not idle'.
thanks,
- Joel
On Fri, Aug 09, 2019 at 04:22:26PM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 09:33:46AM -0700, Paul E. McKenney wrote:
> > On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> > > On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > > > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> > > [snip]
> > > > > > But I could make it something like:
> > > > > > 1. Letting ->head grow if ->head_free busy
> > > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > > >
> > > > > > This would even improve performance, but will still risk going out of memory.
> > > > >
> > > > > It seems I can indeed hit an out of memory condition once I changed it to
> > > > > "letting list grow" (diff is below which applies on top of this patch) while
> > > > > at the same time removing the schedule_timeout(2) and replacing it with
> > > > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > > > starves the worker threads that are executing in workqueue context after a
> > > > > grace period and those are unable to get enough CPU time to kfree things fast
> > > > > enough. But I am not fully sure about it and need to test/trace more to
> > > > > figure out why this is happening.
> > > > >
> > > > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > > > situation goes away.
> > > > >
> > > > > Clearly we need to do more work on this patch.
> > > > >
> > > > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > > > higher priority context so it is not starved and runs often enough. I'll
> > > > > trace more.
> > > > >
> > > > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > > > worker thread some more room to run and see if anything changes. But I am not
> > > > > sure then if we're preparing the code for the real world with such
> > > > > modifications.
> > > > >
> > > > > Any thoughts?
> > > >
> > > > Several! With luck, perhaps some are useful. ;-)
> > > >
> > > > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > > > default is "--memory 500M".
> > >
> > > Thanks, this definitely helped.
>
> Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> am quite happy about that. I think I can declare that the "let list grow
> indefinitely" design works quite well even with an insanely heavily loaded
> case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> thinking - wow how does this stuff even work at such insane scales :-D
A lot of work by a lot of people over a long period of time. On their
behalf, I thank you for the implied compliment. So once this patch gets
in, perhaps you will have complimented yourself as well. ;-)
But more work is needed, and will continue to be needed as new workloads,
compiler optimizations, and hardware appear. And it would be good to
try this on a really big system at some point.
> > > > o Leave a CPU free to run things like the RCU grace-period kthread.
> > > > You might also need to bind that kthread to that CPU.
> > > >
> > > > o Alternatively, use the "rcutree.kthread_prio=" boot parameter to
> > > > boost the RCU kthreads to real-time priority. This won't do
> > > > anything for ksoftirqd, though.
> > >
> > > I will try these as well.
>
> kthread_prio=50 definitely reduced the probability of OOM but it still
> occurred.
OK, interesting.
> > > > o Along with the above boot parameter, use "rcutree.use_softirq=0"
> > > > to cause RCU to use kthreads instead of softirq. (You might well
> > > > find issues in priority setting as well, but might as well find
> > > > them now if so!)
> > >
> > > Doesn't think one actually reduce the priority of the core RCU work? softirq
> > > will always have higher priority than any there. So wouldn't that have the
> > > effect of not reclaiming things fast enough? (Or, in my case not scheduling
> > > the rcu_work which does the reclaim).
> >
> > For low kfree_rcu() loads, yes, it increases overhead due to the need
> > for context switches instead of softirq running at the tail end of an
> > interrupt. But for high kfree_rcu() loads, it gets you realtime priority
> > (in conjunction with "rcutree.kthread_prio=", that is).
>
> I meant for high kfree_rcu() loads, a softirq context executing RCU callback
> is still better from the point of view of the callback running because the
> softirq will run above all else (higher than the highest priority task) so
> use_softirq=0 would be a down grade from that perspective if something higher
> than rcutree.kthread_prio is running on the CPU. So unless kthread_prio is
> set to the highest prio, then softirq running would work better. Did I miss
> something?
Under heavy load, softirq stops running at the tail end of interrupts and
is instead run within the context of a per-CPU ksoftirqd kthread. At normal
SCHED_OTHER priority.
> > > > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > > > with cond_resched() in your kfree_rcu() loop. This simulates
> > > > a trip to userspace for nohz_full CPUs, so if this helps for
> > > > non-nohz_full CPUs, adjustments to the kernel might be called for.
>
> I did not try this yet. But I am thinking why would this help in nohz_idle
> case? In nohz_idle we already have the tick active when CPU is idle. I guess
> it is because there may be a long time that elapses before
> rcu_data.rcu_need_heavy_qs == true ?
Under your heavy rcuperf load, none of the CPUs would ever be idle. Nor
would they ever be in nohz_full userspace context, either.
In contrast, a heavy duty userspace-driven workload would transition to
and from userspace for each kfree_rcu(), and that would increment the
dyntick-idle count on each transition to and from userspace. Adding the
rcu_momentary_dyntick_idle() emulates a pair of such transitions.
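For the record, what I have in mind is roughly the following in the
kfree_perf_thread() loop. This is an untested sketch; the loop body is a
simplified paraphrase of your rcuperf patch (the kfree_no_batch branch is
elided, and the loop counter is renamed), and the only new line is the
rcu_momentary_dyntick_idle() call:

    do {
        for (i = 0; i < kfree_alloc_num; i++) {
            alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
            if (!alloc_ptrs[i])
                return -ENOMEM;
        }

        for (i = 0; i < kfree_alloc_num; i++)
            kfree_rcu(alloc_ptrs[i], rh);

        cond_resched();
        /*
         * Emulate a pair of user/kernel transitions so that the next
         * FQS scan sees this CPU as having passed through a quiescent
         * state, just as a userspace-driven workload would.
         */
        rcu_momentary_dyntick_idle();
    } while (!torture_must_stop() && ++loops < kfree_loops);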
Thanx, Paul
> > > Ok, will try it.
> > >
> > > Save these bullet points for future reference! ;-) thanks,
> >
> > I guess this is helping me to prepare for Plumbers. ;-)
>
> :-)
>
> thanks, Paul!
>
> - Joel
>
On Fri, Aug 09, 2019 at 04:26:45PM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 04:22:26PM -0400, Joel Fernandes wrote:
> > On Fri, Aug 09, 2019 at 09:33:46AM -0700, Paul E. McKenney wrote:
> > > On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> > > > On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > > > > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> > > > [snip]
> > > > > > > But I could make it something like:
> > > > > > > 1. Letting ->head grow if ->head_free busy
> > > > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > > > >
> > > > > > > This would even improve performance, but will still risk going out of memory.
> > > > > >
> > > > > > It seems I can indeed hit an out of memory condition once I changed it to
> > > > > > "letting list grow" (diff is below which applies on top of this patch) while
> > > > > > at the same time removing the schedule_timeout(2) and replacing it with
> > > > > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > > > > starves the worker threads that are executing in workqueue context after a
> > > > > > grace period and those are unable to get enough CPU time to kfree things fast
> > > > > > enough. But I am not fully sure about it and need to test/trace more to
> > > > > > figure out why this is happening.
> > > > > >
> > > > > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > > > > situation goes away.
> > > > > >
> > > > > > Clearly we need to do more work on this patch.
> > > > > >
> > > > > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > > > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > > > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > > > > higher priority context so it is not starved and runs often enough. I'll
> > > > > > trace more.
> > > > > >
> > > > > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > > > > worker thread some more room to run and see if anything changes. But I am not
> > > > > > sure then if we're preparing the code for the real world with such
> > > > > > modifications.
> > > > > >
> > > > > > Any thoughts?
> > > > >
> > > > > Several! With luck, perhaps some are useful. ;-)
> > > > >
> > > > > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > > > > default is "--memory 500M".
> > > >
> > > > Thanks, this definitely helped.
> >
> > Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> > am quite happy about that. I think I can declare that the "let list grow
> > indefinitely" design works quite well even with an insanely heavily loaded
> > case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> > kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> > thinking - wow how does this stuff even work at such insane scales :-D
>
> Oh, and I should probably also count whether there are any 'total number of
> grace periods' reduction, due to the batching!
And, the number of grace periods did dramatically drop (by 5X) with the
batching!! I have modified the rcuperf test to show the number of grace
periods that elapsed during the test.
thanks,
- Joel
On Fri, Aug 09, 2019 at 01:42:17PM -0700, Paul E. McKenney wrote:
> > Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> > am quite happy about that. I think I can declare that the "let list grow
> > indefinitely" design works quite well even with an insanely heavily loaded
> > case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> > kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> > thinking - wow how does this stuff even work at such insane scales :-D
>
> A lot of work by a lot of people over a long period of time. On their
> behalf, I thank you for the implied compliment. So once this patch gets
> in, perhaps you will have complimented yourself as well. ;-)
>
> But more work is needed, and will continue to be as new workloads,
> compiler optimizations, and hardware appears. And it would be good to
> try this on a really big system at some point.
Cool!
> > > > > o Along with the above boot parameter, use "rcutree.use_softirq=0"
> > > > > to cause RCU to use kthreads instead of softirq. (You might well
> > > > > find issues in priority setting as well, but might as well find
> > > > > them now if so!)
> > > >
> > > > Doesn't think one actually reduce the priority of the core RCU work? softirq
> > > > will always have higher priority than any there. So wouldn't that have the
> > > > effect of not reclaiming things fast enough? (Or, in my case not scheduling
> > > > the rcu_work which does the reclaim).
> > >
> > > For low kfree_rcu() loads, yes, it increases overhead due to the need
> > > for context switches instead of softirq running at the tail end of an
> > > interrupt. But for high kfree_rcu() loads, it gets you realtime priority
> > > (in conjunction with "rcutree.kthread_prio=", that is).
> >
> > I meant for high kfree_rcu() loads, a softirq context executing RCU callback
> > is still better from the point of view of the callback running because the
> > softirq will run above all else (higher than the highest priority task) so
> > use_softirq=0 would be a down grade from that perspective if something higher
> > than rcutree.kthread_prio is running on the CPU. So unless kthread_prio is
> > set to the highest prio, then softirq running would work better. Did I miss
> > something?
>
> Under heavy load, softirq stops running at the tail end of interrupts and
> is instead run within the context of a per-CPU ksoftirqd kthread. At normal
> SCHED_OTHER priority.
Ah, yes. Agreed!
> > > > > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > > > > with cond_resched() in your kfree_rcu() loop. This simulates
> > > > > a trip to userspace for nohz_full CPUs, so if this helps for
> > > > > non-nohz_full CPUs, adjustments to the kernel might be called for.
> >
> > I did not try this yet. But I am thinking why would this help in nohz_idle
> > case? In nohz_idle we already have the tick active when CPU is idle. I guess
> > it is because there may be a long time that elapses before
> > rcu_data.rcu_need_heavy_qs == true ?
>
> Under your heavy rcuperf load, none of the CPUs would ever be idle. Nor
> would they every be in nohz_full userspace context, either.
Sorry I made a typo, I meant 'tick active when CPU is non-idle for NOHZ_IDLE
systems' above.
> In contrast, a heavy duty userspace-driven workload would transition to
> and from userspace for each kfree_rcu(), and that would increment the
> dyntick-idle count on each transition to and from userspace. Adding the
> rcu_momentary_dyntick_idle() emulates a pair of such transitions.
But even if we're in kernel mode and not transitioning, I thought the FQS
loop (rcu_implicit_dynticks_qs() function) would set need_heavy_qs to true at
2 * jiffies_to_sched_qs.
Hmm, I forgot that jiffies_to_sched_qs can be quite large I guess. You're
right, we could call rcu_momentary_dyntick_idle() in advance rather than
waiting for the FQS loop to set need_heavy_qs.
Or, am I missing something with the rcu_momentary_dyntick_idle() point you
made?
thanks,
- Joel
>
> Thanx, Paul
>
> > > > Ok, will try it.
> > > >
> > > > Save these bullet points for future reference! ;-) thanks,
> > >
> > > I guess this is helping me to prepare for Plumbers. ;-)
> >
> > :-)
> >
> > thanks, Paul!
> >
> > - Joel
> >
>
On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
[snip]
> > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > {
> > > > int cpu;
> > > >
> > > > + kfree_rcu_batch_init();
> > >
> > > What happens if someone does a kfree_rcu() before this point? It looks
> > > like it should work, but have you tested it?
> > >
> > > > rcu_early_boot_tests();
> > >
> > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > call to kfree_rcu_batch_init() here.
> >
> > I have not tried to do the kfree_rcu() this early. I will try it out.
>
> Yeah, well, call_rcu() this early came as a surprise to me back in the
> day, so... ;-)
I actually did get surprised as well!
It appears the timers are not fully initialized so the really early
kfree_rcu() call from rcu_init() does cause a splat about an uninitialized
timer spinlock (even though future kfree_rcu()s and the system are working
fine all the way into the torture tests).
I think to resolve this, we can just not do batching until early_initcall,
during which I have an initialization function which switches batching on.
From that point it is safe.
Below is the diff on top of this patch, I think this should be good but let
me know if anything looks odd to you. I tested it and it works.
have a great weekend! thanks,
-Joel
---8<-----------------------
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a09ef81a1a4f..358f5c065fa4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2634,6 +2634,7 @@ struct kfree_rcu_cpu {
};
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+int kfree_rcu_batching_ready;
/*
* This function is invoked in workqueue context after a grace period.
@@ -2742,6 +2743,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
spin_unlock_irqrestore(&krcp->lock, flags);
}
+/*
+ * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
+ * Used only by rcuperf torture test for comparison with kfree_rcu_batch()
+ * or during really early init.
+ */
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
+{
+ __call_rcu(head, func, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
+
/*
* Queue a request for lazy invocation of kfree() after a grace period.
*
@@ -2764,6 +2775,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool monitor_todo;
+ static int once;
+
+ if (!READ_ONCE(kfree_rcu_batching_ready))
+ return kfree_call_rcu_nobatch(head, func);
local_irq_save(flags);
krcp = this_cpu_ptr(&krc);
@@ -2794,16 +2809,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
-/*
- * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
- * Used only by rcuperf torture test for comparison with kfree_rcu_batch().
- */
-void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
-{
- __call_rcu(head, func, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
-
/*
* During early boot, any blocking grace-period wait automatically
* implies a grace period. Later on, this is never the case for PREEMPT.
@@ -3650,17 +3655,6 @@ static void __init rcu_dump_rcu_node_tree(void)
pr_cont("\n");
}
-void kfree_rcu_batch_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- spin_lock_init(&krcp->lock);
- INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- }
-}
-
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
@@ -3668,8 +3662,6 @@ void __init rcu_init(void)
{
int cpu;
- kfree_rcu_batch_init();
-
rcu_early_boot_tests();
rcu_bootup_announce();
@@ -3700,6 +3692,21 @@ void __init rcu_init(void)
srcu_init();
}
+static int __init kfree_rcu_batch_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+ spin_lock_init(&krcp->lock);
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ }
+
+ WRITE_ONCE(kfree_rcu_batching_ready, 1);
+ return 0;
+}
+early_initcall(kfree_rcu_batch_init);
+
#include "tree_stall.h"
#include "tree_exp.h"
#include "tree_plugin.h"
On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> [snip]
> > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > {
> > > > > int cpu;
> > > > >
> > > > > + kfree_rcu_batch_init();
> > > >
> > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > like it should work, but have you tested it?
> > > >
> > > > > rcu_early_boot_tests();
> > > >
> > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > call to kfree_rcu_batch_init() here.
> > >
> > > I have not tried to do the kfree_rcu() this early. I will try it out.
> >
> > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > day, so... ;-)
>
> I actually did get surprised as well!
>
> It appears the timers are not fully initialized so the really early
> kfree_rcu() call from rcu_init() does cause a splat about an initialized
> timer spinlock (even though future kfree_rcu()s and the system are working
> fine all the way into the torture tests).
>
> I think to resolve this, we can just not do batching until early_initcall,
> during which I have an initialization function which switches batching on.
> From that point it is safe.
Just go ahead and batch, but don't bother with the timer until
after single-threaded boot is done. For example, you could check
rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
(See kernel/rcu/tree_exp.h.)
If needed, use an early_initcall() to handle the case where early boot
kfree_rcu() calls needed to set the timer but could not.
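Something like the following inside kfree_call_rcu(), with krcp->lock
already held, is what I am picturing. Untested sketch, reusing the field
names from your patch:

    /* Queue the kfree_rcu() request as before. */
    head->func = func;
    head->next = krcp->head;
    krcp->head = head;

    /* Don't touch the drain timer until the scheduler is fully up. */
    if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
        !krcp->monitor_todo) {
            krcp->monitor_todo = true;
            schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
    }

During single-threaded boot the per-CPU list just grows, and if anything
was queued that early, the early_initcall() can kick the monitor then.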
> Below is the diff on top of this patch, I think this should be good but let
> me know if anything looks odd to you. I tested it and it works.
Keep in mind that a call_rcu() callback can't possibly be invoked until
quite some time after the scheduler is up and running. So it will be
a lot simpler to just skip setting the timer during early boot.
Thanx, Paul
> have a great weekend! thanks,
> -Joel
>
> ---8<-----------------------
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index a09ef81a1a4f..358f5c065fa4 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2634,6 +2634,7 @@ struct kfree_rcu_cpu {
> };
>
> static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
> +int kfree_rcu_batching_ready;
>
> /*
> * This function is invoked in workqueue context after a grace period.
> @@ -2742,6 +2743,17 @@ static void kfree_rcu_monitor(struct work_struct *work)
> spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> +/*
> + * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
> + * Used only by rcuperf torture test for comparison with kfree_rcu_batch()
> + * or during really early init.
> + */
> +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
> +{
> + __call_rcu(head, func, -1, 1);
> +}
> +EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
> +
> /*
> * Queue a request for lazy invocation of kfree() after a grace period.
> *
> @@ -2764,6 +2775,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> unsigned long flags;
> struct kfree_rcu_cpu *krcp;
> bool monitor_todo;
> + static int once;
> +
> + if (!READ_ONCE(kfree_rcu_batching_ready))
> + return kfree_call_rcu_nobatch(head, func);
>
> local_irq_save(flags);
> krcp = this_cpu_ptr(&krc);
> @@ -2794,16 +2809,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> }
> EXPORT_SYMBOL_GPL(kfree_call_rcu);
>
> -/*
> - * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
> - * Used only by rcuperf torture test for comparison with kfree_rcu_batch().
> - */
> -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
> -{
> - __call_rcu(head, func, -1, 1);
> -}
> -EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
> -
> /*
> * During early boot, any blocking grace-period wait automatically
> * implies a grace period. Later on, this is never the case for PREEMPT.
> @@ -3650,17 +3655,6 @@ static void __init rcu_dump_rcu_node_tree(void)
> pr_cont("\n");
> }
>
> -void kfree_rcu_batch_init(void)
> -{
> - int cpu;
> -
> - for_each_possible_cpu(cpu) {
> - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> - spin_lock_init(&krcp->lock);
> - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> - }
> -}
> -
> struct workqueue_struct *rcu_gp_wq;
> struct workqueue_struct *rcu_par_gp_wq;
>
> @@ -3668,8 +3662,6 @@ void __init rcu_init(void)
> {
> int cpu;
>
> - kfree_rcu_batch_init();
> -
> rcu_early_boot_tests();
>
> rcu_bootup_announce();
> @@ -3700,6 +3692,21 @@ void __init rcu_init(void)
> srcu_init();
> }
>
> +static int __init kfree_rcu_batch_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> + spin_lock_init(&krcp->lock);
> + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> + }
> +
> + WRITE_ONCE(kfree_rcu_batching_ready, 1);
> + return 0;
> +}
> +early_initcall(kfree_rcu_batch_init);
> +
> #include "tree_stall.h"
> #include "tree_exp.h"
> #include "tree_plugin.h"
On Fri, Aug 09, 2019 at 05:25:12PM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 04:26:45PM -0400, Joel Fernandes wrote:
> > On Fri, Aug 09, 2019 at 04:22:26PM -0400, Joel Fernandes wrote:
> > > On Fri, Aug 09, 2019 at 09:33:46AM -0700, Paul E. McKenney wrote:
> > > > On Fri, Aug 09, 2019 at 11:39:24AM -0400, Joel Fernandes wrote:
> > > > > On Fri, Aug 09, 2019 at 08:16:19AM -0700, Paul E. McKenney wrote:
> > > > > > On Thu, Aug 08, 2019 at 07:30:14PM -0400, Joel Fernandes wrote:
> > > > > [snip]
> > > > > > > > But I could make it something like:
> > > > > > > > 1. Letting ->head grow if ->head_free busy
> > > > > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > > > > >
> > > > > > > > This would even improve performance, but will still risk going out of memory.
> > > > > > >
> > > > > > > It seems I can indeed hit an out of memory condition once I changed it to
> > > > > > > "letting list grow" (diff is below which applies on top of this patch) while
> > > > > > > at the same time removing the schedule_timeout(2) and replacing it with
> > > > > > > cond_resched() in the rcuperf test. I think the reason is the rcuperf test
> > > > > > > starves the worker threads that are executing in workqueue context after a
> > > > > > > grace period and those are unable to get enough CPU time to kfree things fast
> > > > > > > enough. But I am not fully sure about it and need to test/trace more to
> > > > > > > figure out why this is happening.
> > > > > > >
> > > > > > > If I add back the schedule_uninterruptibe_timeout(2) call, the out of memory
> > > > > > > situation goes away.
> > > > > > >
> > > > > > > Clearly we need to do more work on this patch.
> > > > > > >
> > > > > > > In the regular kfree_rcu_no_batch() case, I don't hit this issue. I believe
> > > > > > > that since the kfree is happening in softirq context in the _no_batch() case,
> > > > > > > it fares better. The question then I guess is how do we run the rcu_work in a
> > > > > > > higher priority context so it is not starved and runs often enough. I'll
> > > > > > > trace more.
> > > > > > >
> > > > > > > Perhaps I can also lower the priority of the rcuperf threads to give the
> > > > > > > worker thread some more room to run and see if anything changes. But I am not
> > > > > > > sure then if we're preparing the code for the real world with such
> > > > > > > modifications.
> > > > > > >
> > > > > > > Any thoughts?
> > > > > >
> > > > > > Several! With luck, perhaps some are useful. ;-)
> > > > > >
> > > > > > o Increase the memory via kvm.sh "--memory 1G" or more. The
> > > > > > default is "--memory 500M".
> > > > >
> > > > > Thanks, this definitely helped.
> > >
> > > Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> > > am quite happy about that. I think I can declare that the "let list grow
> > > indefinitely" design works quite well even with an insanely heavily loaded
> > > case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> > > kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> > > thinking - wow how does this stuff even work at such insane scales :-D
> >
> > Oh, and I should probably also count whether there are any 'total number of
> > grace periods' reduction, due to the batching!
>
> And, the number of grace periods did dramatically drop (by 5X) with the
> batching!! I have modified the rcuperf test to show the number of grace
> periods that elapsed during the test.
Very good! Batching for the win! ;-)
Thanx, Paul
On Fri, Aug 09, 2019 at 05:36:43PM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 01:42:17PM -0700, Paul E. McKenney wrote:
> > > Also, I can go back to 500M if I just keep KFREE_DRAIN_JIFFIES at HZ/50. So I
> > > am quite happy about that. I think I can declare that the "let list grow
> > > indefinitely" design works quite well even with an insanely heavily loaded
> > > case of every CPU in a 16CPU system with 500M memory, indefinitely doing
> > > kfree_rcu()in a tight loop with appropriate cond_resched(). And I am like
> > > thinking - wow how does this stuff even work at such insane scales :-D
> >
> > A lot of work by a lot of people over a long period of time. On their
> > behalf, I thank you for the implied compliment. So once this patch gets
> > in, perhaps you will have complimented yourself as well. ;-)
> >
> > But more work is needed, and will continue to be as new workloads,
> > compiler optimizations, and hardware appears. And it would be good to
> > try this on a really big system at some point.
>
> Cool!
>
> > > > > > o Along with the above boot parameter, use "rcutree.use_softirq=0"
> > > > > > to cause RCU to use kthreads instead of softirq. (You might well
> > > > > > find issues in priority setting as well, but might as well find
> > > > > > them now if so!)
> > > > >
> > > > > Doesn't think one actually reduce the priority of the core RCU work? softirq
> > > > > will always have higher priority than any there. So wouldn't that have the
> > > > > effect of not reclaiming things fast enough? (Or, in my case not scheduling
> > > > > the rcu_work which does the reclaim).
> > > >
> > > > For low kfree_rcu() loads, yes, it increases overhead due to the need
> > > > for context switches instead of softirq running at the tail end of an
> > > > interrupt. But for high kfree_rcu() loads, it gets you realtime priority
> > > > (in conjunction with "rcutree.kthread_prio=", that is).
> > >
> > > I meant for high kfree_rcu() loads, a softirq context executing RCU callback
> > > is still better from the point of view of the callback running because the
> > > softirq will run above all else (higher than the highest priority task) so
> > > use_softirq=0 would be a down grade from that perspective if something higher
> > > than rcutree.kthread_prio is running on the CPU. So unless kthread_prio is
> > > set to the highest prio, then softirq running would work better. Did I miss
> > > something?
> >
> > Under heavy load, softirq stops running at the tail end of interrupts and
> > is instead run within the context of a per-CPU ksoftirqd kthread. At normal
> > SCHED_OTHER priority.
>
> Ah, yes. Agreed!
>
> > > > > > o With any of the above, invoke rcu_momentary_dyntick_idle() along
> > > > > > with cond_resched() in your kfree_rcu() loop. This simulates
> > > > > > a trip to userspace for nohz_full CPUs, so if this helps for
> > > > > > non-nohz_full CPUs, adjustments to the kernel might be called for.
> > >
> > > I did not try this yet. But I am thinking why would this help in nohz_idle
> > > case? In nohz_idle we already have the tick active when CPU is idle. I guess
> > > it is because there may be a long time that elapses before
> > > rcu_data.rcu_need_heavy_qs == true ?
> >
> > Under your heavy rcuperf load, none of the CPUs would ever be idle. Nor
> > would they every be in nohz_full userspace context, either.
>
> Sorry I made a typo, I meant 'tick active when CPU is non-idle for NOHZ_IDLE
> systems' above.
>
> > In contrast, a heavy duty userspace-driven workload would transition to
> > and from userspace for each kfree_rcu(), and that would increment the
> > dyntick-idle count on each transition to and from userspace. Adding the
> > rcu_momentary_dyntick_idle() emulates a pair of such transitions.
>
> But even if we're in kernel mode and not transitioning, I thought the FQS
> loop (rcu_implicit_dynticks_qs() function) would set need_heavy_qs to true at
> 2 * jiffies_to_sched_qs.
>
> Hmm, I forgot that jiffies_to_sched_qs can be quite large I guess. You're
> right, we could call rcu_momentary_dyntick_idle() in advance before waiting
> for FQS loop to do the setting of need_heavy_qs.
>
> Or, am I missing something with the rcu_momentary_dyntick_idle() point you
> made?
The trick is that rcu_momentary_dyntick_idle() directly increments the
CPU's dyntick counter, so that the next FQS loop will note that the CPU
passed through a quiescent state. No need for need_heavy_qs in this case.
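Roughly, paraphrasing rcu_momentary_dyntick_idle() from kernel/rcu/tree.c
(details from memory, so treat as a sketch):

    void rcu_momentary_dyntick_idle(void)
    {
        int special;

        raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
        /* Add a full idle sojourn's worth to this CPU's ->dynticks. */
        special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
                                    &this_cpu_ptr(&rcu_data)->dynticks);
        /* It is illegal to call this from the idle loop itself. */
        WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
        rcu_preempt_deferred_qs(current);
    }

So to rcu_implicit_dynticks_qs() the counter looks exactly as if the CPU
had gone idle and come back.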
Thanx, Paul
> thanks,
>
> - Joel
>
>
> >
> > Thanx, Paul
> >
> > > > > Ok, will try it.
> > > > >
> > > > > Save these bullet points for future reference! ;-) thanks,
> > > >
> > > > I guess this is helping me to prepare for Plumbers. ;-)
> > >
> > > :-)
> > >
> > > thanks, Paul!
> > >
> > > - Joel
> > >
> >
>
On Fri, Aug 09, 2019 at 08:40:27PM -0700, Paul E. McKenney wrote:
[snip]
> > > In contrast, a heavy duty userspace-driven workload would transition to
> > > and from userspace for each kfree_rcu(), and that would increment the
> > > dyntick-idle count on each transition to and from userspace. Adding the
> > > rcu_momentary_dyntick_idle() emulates a pair of such transitions.
> >
> > But even if we're in kernel mode and not transitioning, I thought the FQS
> > loop (rcu_implicit_dynticks_qs() function) would set need_heavy_qs to true at
> > 2 * jiffies_to_sched_qs.
> >
> > Hmm, I forgot that jiffies_to_sched_qs can be quite large I guess. You're
> > right, we could call rcu_momentary_dyntick_idle() in advance before waiting
> > for FQS loop to do the setting of need_heavy_qs.
> >
> > Or, am I missing something with the rcu_momentary_dyntick_idle() point you
> > made?
>
> The trick is that rcu_momentary_dyntick_idle() directly increments the
> CPU's dyntick counter, so that the next FQS loop will note that the CPU
> passed through a quiescent state. No need for need_heavy_qs in this case.
Yes, that's what I also understand. Thanks for confirming,
- Joel
On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > [snip]
> > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > {
> > > > > > int cpu;
> > > > > >
> > > > > > + kfree_rcu_batch_init();
> > > > >
> > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > like it should work, but have you tested it?
> > > > >
> > > > > > rcu_early_boot_tests();
> > > > >
> > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > call to kfree_rcu_batch_init() here.
> > > >
> > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > >
> > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > day, so... ;-)
> >
> > I actually did get surprised as well!
> >
> > It appears the timers are not fully initialized so the really early
> > kfree_rcu() call from rcu_init() does cause a splat about an initialized
> > timer spinlock (even though future kfree_rcu()s and the system are working
> > fine all the way into the torture tests).
> >
> > I think to resolve this, we can just not do batching until early_initcall,
> > during which I have an initialization function which switches batching on.
> > From that point it is safe.
>
> Just go ahead and batch, but don't bother with the timer until
> after single-threaded boot is done. For example, you could check
> rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> (See kernel/rcu/tree_exp.h.)
Cool, that works nicely and I tested it. Actually I made it so that we don't
even need to batch before the scheduler is up. I don't see any benefit to
batching that early unless we can see a kfree_rcu() flood happening that early
at boot, which seems highly doubtful as a real-world case.
> If needed, use an early_initcall() to handle the case where early boot
> kfree_rcu() calls needed to set the timer but could not.
And it would also need the complexity of an early_initcall.
> > Below is the diff on top of this patch, I think this should be good but let
> > me know if anything looks odd to you. I tested it and it works.
>
> Keep in mind that a call_rcu() callback can't possibly be invoked until
> quite some time after the scheduler is up and running. So it will be
> a lot simpler to just skip setting the timer during early boot.
Sure. Skipping batching would skip the timer too :-D
If, in the future, batching is needed this early, then I am happy to add an
early_initcall to set up the timer for any batched calls that could not set up
the timer. Hope that is ok with you?
thanks,
- Joel
[snip]
On Sat, Aug 10, 2019 at 12:20:37AM -0400, Joel Fernandes wrote:
> On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> > On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > [snip]
> > > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > > {
> > > > > > > int cpu;
> > > > > > >
> > > > > > > + kfree_rcu_batch_init();
> > > > > >
> > > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > > like it should work, but have you tested it?
> > > > > >
> > > > > > > rcu_early_boot_tests();
> > > > > >
> > > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > > call to kfree_rcu_batch_init() here.
> > > > >
> > > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > > >
> > > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > > day, so... ;-)
> > >
> > > I actually did get surprised as well!
> > >
> > > It appears the timers are not fully initialized so the really early
> > > kfree_rcu() call from rcu_init() does cause a splat about an initialized
> > > timer spinlock (even though future kfree_rcu()s and the system are working
> > > fine all the way into the torture tests).
> > >
> > > I think to resolve this, we can just not do batching until early_initcall,
> > > during which I have an initialization function which switches batching on.
> > > From that point it is safe.
> >
> > Just go ahead and batch, but don't bother with the timer until
> > after single-threaded boot is done. For example, you could check
> > rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> > (See kernel/rcu/tree_exp.h.)
>
> Cool, that works nicely and I tested it. Actually I made it such that we
> don't need to batch even, before the scheduler is up. I don't see any benefit
> of that unless we can see a kfree_rcu() flood happening that early at boot
> which seems highly doubtful as a real world case.
The benefit is removing the kfree_rcu() special cases from the innards
of RCU, for example, in rcu_do_batch(). Another benefit is removing the
current restriction on the position of the rcu_head structure within the
enclosing data structure.
So it would be good to avoid the current kfree_rcu() special casing within
RCU itself.
Or are you using some trick that avoids both the batching and the current
kfree_rcu() special casing?
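For reference, the special casing I mean is the __is_kfree_rcu_offset()
branch that every callback invocation goes through in __rcu_reclaim().
Roughly, paraphrasing kernel/rcu/rcu.h with the tracing and lockdep bits
omitted:

    static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
    {
        unsigned long offset = (unsigned long)head->func;

        if (__is_kfree_rcu_offset(offset)) {
            /* kfree_rcu(): ->func encodes the rcu_head's offset. */
            kfree((void *)head - offset);
            return true;
        }
        /* Ordinary call_rcu() callback. */
        head->func(head);
        return false;
    }

It is this offset encoding that also forces the rcu_head to sit within the
first 4096 bytes of the enclosing structure.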
> > If needed, use an early_initcall() to handle the case where early boot
> > kfree_rcu() calls needed to set the timer but could not.
>
> And it would also need this complexity of early_initcall.
It would, but that (1) should not be all that complex, (2) only executes
the one time at boot rather than being an extra check on each callback,
and (3) is in memory that can be reclaimed (though I confess that I am
not sure how many architectures still do this).
> > > Below is the diff on top of this patch, I think this should be good but let
> > > me know if anything looks odd to you. I tested it and it works.
> >
> > Keep in mind that a call_rcu() callback can't possibly be invoked until
> > quite some time after the scheduler is up and running. So it will be
> > a lot simpler to just skip setting the timer during early boot.
>
> Sure. Skipping batching would skip the timer too :-D
Yes, but if I understand your approach correctly, it unfortunately does not
manage to get rid of the kfree_rcu() special casing.
> If in the future, batching is needed this early, then I am happy to add an
> early_initcall to setup the timer for any batched calls that could not setup
> the timer. Hope that is ok with you?
If you have some trick to nevertheless get rid of the current kfree_rcu()
special casing within RCU proper, sure!
Thanx, Paul
On Tue, Aug 06, 2019 at 05:29:15PM -0700, Paul E. McKenney wrote:
> On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
> > This test runs kfree_rcu in a loop to measure performance of the new
> > kfree_rcu, with and without patch.
> >
> > To see improvement, run with boot parameters:
> > rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree
> >
> > Without patch, test runs in 6.9 seconds.
> > With patch, test runs in 6.1 seconds (+13% improvement)
> >
> > If it is desired to run the test but with the traditional (non-batched)
> > kfree_rcu, for example to compare results, then you could pass along the
> > rcuperf.kfree_no_batch=1 boot parameter.
>
> You lost me on this one. You ran two runs, with rcuperf.kfree_no_batch=1
> and without? Or you ran this patch both with and without the earlier
> patch, and could have run with the patch and rcuperf.kfree_no_batch=1?
>
> If the latter, it would be good to try all three.
Did this in new patch, will post shortly.
[snip]
> > +torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
> > +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
> > +torture_param(int, kfree_alloc_size, 16, "Size of each allocation");
>
> Is this used? How does it relate to KFREE_OBJ_BYTES?
It is not used, I removed it.
> > +torture_param(int, kfree_loops, 10, "Size of each allocation");
>
> I suspect that this kfree_loops string is out of date.
Updated, thanks.
> > +torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
>
> All of these need to be added to kernel-parameters.txt. Along with
> any added by the earlier patch, for that matter.
Will do.
> > +static struct task_struct **kfree_reader_tasks;
> > +static int kfree_nrealthreads;
> > +static atomic_t n_kfree_perf_thread_started;
> > +static atomic_t n_kfree_perf_thread_ended;
> > +
> > +#define KFREE_OBJ_BYTES 8
> > +
> > +struct kfree_obj {
> > + char kfree_obj[KFREE_OBJ_BYTES];
> > + struct rcu_head rh;
> > +};
> > +
> > +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
> > +
> > +static int
> > +kfree_perf_thread(void *arg)
> > +{
> > + int i, l = 0;
>
> It is really easy to confuse "l" and "1" in some fonts, so please use
> a different name. (From the "showing my age" department: On typical
> 1970s typewriters, there was no numeral "1" -- you typed the letter
> "l" instead, thus anticipating at least the first digit of "1337".)
Changed l to loops ;). I did see typewriters around in my childhood; I thought
they were pretty odd machines :-D. I am sure my daughter will think the same
about land-line phones :-D
> > + long me = (long)arg;
> > + struct kfree_obj **alloc_ptrs;
> > + u64 start_time, end_time;
> > +
> > + VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
> > + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
> > + set_user_nice(current, MAX_NICE);
> > + atomic_inc(&n_kfree_perf_thread_started);
> > +
> > + alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
> > + GFP_KERNEL);
> > + if (!alloc_ptrs)
> > + return -ENOMEM;
> > +
> > + start_time = ktime_get_mono_fast_ns();
>
> Don't you want to announce that you started here rather than above in
> order to avoid (admittedly slight) measurement inaccuracies?
Yes, in the revised patch I am announcing it here.
> > + do {
> > + for (i = 0; i < kfree_alloc_num; i++) {
> > + alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
> > + if (!alloc_ptrs[i])
> > + return -ENOMEM;
> > + }
> > +
> > + for (i = 0; i < kfree_alloc_num; i++) {
> > + if (!kfree_no_batch) {
> > + kfree_rcu(alloc_ptrs[i], rh);
> > + } else {
> > + rcu_callback_t cb;
> > +
> > + cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
> > + kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
> > + }
> > + }
> > +
> > + schedule_timeout_uninterruptible(2);
>
> Why the two-jiffy wait in the middle of a timed test? Yes, you need
> a cond_resched() and maybe more here, but a two-jiffy wait? I don't
> see how this has any chance of getting valid measurements.
>
> What am I missing here?
Replaced it with cond_resched(), as we discussed.
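The inner loop now looks roughly like this (just a sketch of the direction;
the kfree_no_batch branch is elided and "l" has become "loops"):

	do {
		for (i = 0; i < kfree_alloc_num; i++) {
			alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
			if (!alloc_ptrs[i])
				return -ENOMEM;
		}

		for (i = 0; i < kfree_alloc_num; i++)
			kfree_rcu(alloc_ptrs[i], rh);

		cond_resched();	/* instead of the two-jiffy sleep */
	} while (!torture_must_stop() && ++loops < kfree_loops);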
> > + } while (!torture_must_stop() && ++l < kfree_loops);
> > +
> > + kfree(alloc_ptrs);
> > +
> > + if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
> > + end_time = ktime_get_mono_fast_ns();
>
> Don't we want to capture the end time before the kfree()?
Fixed.
> > + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d\n",
> > + (unsigned long long)(end_time - start_time), kfree_loops);
> > + if (shutdown) {
> > + smp_mb(); /* Assign before wake. */
> > + wake_up(&shutdown_wq);
> > + }
> > + }
> > +
> > + torture_kthread_stopping("kfree_perf_thread");
> > + return 0;
> > +}
> > +
> > +static void
> > +kfree_perf_cleanup(void)
> > +{
> > + int i;
> > +
> > + if (torture_cleanup_begin())
> > + return;
> > +
> > + if (kfree_reader_tasks) {
> > + for (i = 0; i < kfree_nrealthreads; i++)
> > + torture_stop_kthread(kfree_perf_thread,
> > + kfree_reader_tasks[i]);
> > + kfree(kfree_reader_tasks);
> > + }
> > +
> > + torture_cleanup_end();
> > +}
> > +
> > +/*
> > + * shutdown kthread. Just waits to be awakened, then shuts down system.
> > + */
> > +static int
> > +kfree_perf_shutdown(void *arg)
> > +{
> > + do {
> > + wait_event(shutdown_wq,
> > + atomic_read(&n_kfree_perf_thread_ended) >=
> > + kfree_nrealthreads);
> > + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
> > +
> > + smp_mb(); /* Wake before output. */
> > +
> > + kfree_perf_cleanup();
> > + kernel_power_off();
> > + return -EINVAL;
> > +}
>
> Is there some way to avoid (almost) duplicating rcu_perf_shutdown()?
At the moment, I don't see a good way to do this without passing in function
pointers or using macros, which I think would look uglier than the above
addition. Sorry.
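For reference, the function-pointer variant I considered would look something
like the sketch below (a hypothetical helper, not in the patch), and to my eye
it is more code and indirection than the small duplication it removes:

/* Hypothetical shared helper; each perf flavor would wrap it. */
static int perf_shutdown_common(atomic_t *n_ended, int nrealthreads,
				void (*cleanup)(void))
{
	do {
		wait_event(shutdown_wq,
			   atomic_read(n_ended) >= nrealthreads);
	} while (atomic_read(n_ended) < nrealthreads);

	smp_mb(); /* Wake before output. */
	cleanup();
	kernel_power_off();
	return -EINVAL;
}

static int kfree_perf_shutdown(void *arg)
{
	return perf_shutdown_common(&n_kfree_perf_thread_ended,
				    kfree_nrealthreads, kfree_perf_cleanup);
}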
thanks,
- Joel
On Sat, Aug 10, 2019 at 11:24:46AM -0700, Paul E. McKenney wrote:
> On Sat, Aug 10, 2019 at 12:20:37AM -0400, Joel Fernandes wrote:
> > On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> > > On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > [snip]
> > > > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > > > {
> > > > > > > > int cpu;
> > > > > > > >
> > > > > > > > + kfree_rcu_batch_init();
> > > > > > >
> > > > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > > > like it should work, but have you tested it?
> > > > > > >
> > > > > > > > rcu_early_boot_tests();
> > > > > > >
> > > > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > > > call to kfree_rcu_batch_init() here.
> > > > > >
> > > > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > > > >
> > > > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > > > day, so... ;-)
> > > >
> > > > I actually did get surprised as well!
> > > >
> > > > It appears the timers are not fully initialized so the really early
> > > > kfree_rcu() call from rcu_init() does cause a splat about an initialized
> > > > timer spinlock (even though future kfree_rcu()s and the system are working
> > > > fine all the way into the torture tests).
> > > >
> > > > I think to resolve this, we can just not do batching until early_initcall,
> > > > during which I have an initialization function which switches batching on.
> > > > >From that point it is safe.
> > >
> > > Just go ahead and batch, but don't bother with the timer until
> > > after single-threaded boot is done. For example, you could check
> > > rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> > > (See kernel/rcu/tree_exp.h.)
> >
> > Cool, that works nicely and I tested it. Actually I made it such that we
> > don't need to batch even, before the scheduler is up. I don't see any benefit
> > of that unless we can see a kfree_rcu() flood happening that early at boot
> > which seems highly doubtful as a real world case.
>
> The benefit is removing the kfree_rcu() special cases from the innards
> of RCU, for example, in rcu_do_batch(). Another benefit is removing the
> current restriction on the position of the rcu_head structure within the
> enclosing data structure.
>
> So it would be good to avoid the current kfree_rcu() special casing within
> RCU itself.
>
> Or are you using some trick that avoids both the batching and the current
> kfree_rcu() special casing?
Oh. I see what you mean. Would it be Ok with you to have that be a follow-up
patch? I am not getting rid (yet) of the special casing in rcu_do_batch in
this patch, but can do that in another patch.
For now I am just doing something like the following in kfree_call_rcu(). I
was about to hit send on v1, and I have been testing this a lot, so I'll post
it anyway; we can discuss this point further there.
+void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+ bool monitor_todo;
+
+ /* kfree_call_rcu() batching requires timers to be up. If the scheduler
+ * is not yet up, just skip batching and do non-batched kfree_call_rcu().
+ */
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+ return kfree_call_rcu_nobatch(head, func);
+
thanks,
- Joel
On Thu, Aug 08, 2019 at 11:09:16AM -0700, Paul E. McKenney wrote:
> On Thu, Aug 08, 2019 at 11:23:17PM +0900, Byungchul Park wrote:
> > On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
> > >
> > > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > > [ . . . ]
> > > > > > > > + for (; head; head = next) {
> > > > > > > > + next = head->next;
> > > > > > > > + head->next = NULL;
> > > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > > >
> > > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > > are not going to be at all happy with this loop.
> > > > > >
> > > > > > Ok, will add this here.
> > > > > >
> > > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > > need to become greater-or-equal comparisons or some such.
> > > > > >
> > > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > > then add more tests to validate the improvements.
> > > > >
> > > > > Out of pity for people bisecting, we need this fixed up front.
> > > > >
> > > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > > available. That way you are looping with interrupts and preemption
> > > > > enabled in workqueue context, which is much less damaging than doing so
> > > > > with interrupts disabled, and possibly even from hard-irq context.
> > > >
> > > > Agree.
> > > >
> > > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > > KFREE_MAX_BATCH):
> > > >
> > > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > > >
> > > > On success: Same as now.
> > > > On fail: let ->head grow and drain if possible, until reaching to
> > > > KFREE_MAX_BATCH_FORCE.
> >
> > I should've explain this in more detail. This actually mean:
> >
> > On fail: Let ->head grow and queue rcu_work when ->head_free == NULL,
> > until reaching to _FORCE.
> >
> > > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > > one from now on to prevent too many pending requests from being
> > > > queued for batching work.
> >
> > This mean:
> >
> > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests to be added
> > from now on but instead handle one by one to prevent too many
> > pending requests
Oh! I'm sorry for the weirdly formatted mail; I wrote it with a different
mail client than the one I usually use, outside of the office.
> > from being queued. Of course, the requests already having been
> > queued in ->head
> > so far should be handled by rcu_work when it's possible which can
> > be checked by
> > the monitor or kfree_rcu() inside every call.
>
> But does this really help? After all, the reason we have piled up a
> large number of additional callbacks is likely because the grace period
> is taking a long time, or because a huge number of callbacks has been
> queued up. Sure, these callbacks might get a head start on the following
> grace period, but at the expense of still retaining the kfree_rcu()
> special cases in rcu_do_batch().
Now I can see what you want to achieve with this work. Then we'd
better avoid that kind of special case as much as possible.
> Another potential issue is interaction with rcu_barrier(). Currently,
> rcu_barrier() waits for memory passed to prior kfree_rcu() calls to be
> freed. This is useful to allow a large amount of memory be be completely
> freed before allocating large amounts more memory. With the earlier
> version of the patch, an rcu_barrier() followed by a flush_workqueue().
> But #3 above would reorder the objects so that this approach might not
> wait for everything.
That wouldn't matter if the queue were operated in a FIFO manner though,
so as to guarantee the order.
But now that we can see letting the list just grow works well, we don't
have to consider this one at the moment. Let's revisit this method if we
ever face the problem in the future.
> We should therefore just let the second list grow. If experience shows
> a need for callbacks to be sent up more quickly, it should be possible
> to provide an additional list, so that two lists on a given CPU can both
> be waiting for a grace period at the same time.
Or a third and fourth list might be needed on some systems. But let's
talk about that later too.
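(If that ever becomes necessary, I imagine the shape of it would be an array
of in-flight lists per CPU, something like the sketch below; this is only to
illustrate the idea, not proposed code.)

#define KFREE_N_BATCHES 2	/* a third or fourth slot would be added here */

struct kfree_rcu_cpu {				/* sketch only */
	struct rcu_work rcu_work[KFREE_N_BATCHES];
	struct rcu_head *head;			/* still being filled */
	struct rcu_head *head_free[KFREE_N_BATCHES]; /* each awaiting a GP */
	spinlock_t lock;
	struct delayed_work monitor_work;
	bool monitor_todo;
};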
> > > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > > raising which is doing this loop possibly in irq disabled / hardirq context.
> >
> > I added more explanation above. What I suggested is a way to avoid not
> > only heavy
> > work within the irq-disabled region of a single kfree_rcu() but also
> > too many requests
> > to be queued into ->head.
>
> But let's start simple, please!
Yes. The simpler, the better.
> > > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > > even worse. Consider a real-time system with a lot of memory, in this case
> > > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > > would not be Ok.
> >
> > Please check the explanation above.
> >
> > > But I could make it something like:
> > > 1. Letting ->head grow if ->head_free busy
> > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> >
> > This is exactly what Paul said. The problem with this is ->head can grow too
> > much. That's why I suggested the above one.
>
> It can grow quite large, but how do you know that limiting its size will
> really help? Sure, you have limited the size, but does that really do
To decide the size, we might have to look at how much pressure there is on
memory and RCU at that moment and adjust it at runtime.
> anything for the larger problem of extreme kfree_rcu() rates on the one
> hand and a desire for more efficient handling of kfree_rcu() on the other?
Assuming the current RCU logic handles extremely high rates well, which is
anyway true, my answer is *yes*, because batching has both pros and cons.
One of the major cons is that there will inevitably be kfree_rcu() requests
that have not even been submitted to RCU yet. By limiting the batch size,
that situation can be mitigated.
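Just to make what I mean concrete, the queueing decision would be roughly the
sketch below, where KFREE_MAX_BATCH_FORCE is the hypothetical hard limit;
again, this is only to explain the idea, not something I am asking to merge
now:

	/* Sketch, with krcp->lock held from kfree_call_rcu(): */
	if (krcp->kfree_batch_len >= KFREE_MAX_BATCH_FORCE) {
		/* Too much pending already: handle this one unbatched. */
		spin_unlock_irqrestore(&krcp->lock, flags);
		kfree_call_rcu_nobatch(head, func);
		return;
	}

	/* Otherwise keep batching, and try to drain at the soft limit. */
	head->func = func;
	head->next = krcp->head;
	krcp->head = head;
	krcp->kfree_batch_len++;

	if (krcp->kfree_batch_len >= KFREE_MAX_BATCH)
		queue_kfree_rcu_work(krcp);	/* may fail if ->head_free is busy */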
That is just my answer to your question. But again, let's talk about it later
once we face the problem, as you said.
Thanks,
Byungchul
> Thanx, Paul
>
> > > This would even improve performance, but will still risk going out of memory.
> > >
> > > Thoughts?
> > >
> > > thanks,
> > >
> > > - Joel
> > >
> > > >
> > > > This way, we can avoid both:
> > > >
> > > > 1. too many requests being queued and
> > > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > > >
> > > > Thanks,
> > > > Byungchul
> > > >
> > > > >
> > > > > But please feel free to come up with a better solution!
> > > > >
> > > > > [ . . . ]
> >
> >
> >
> > --
> > Thanks,
> > Byungchul
> >
On Sun, Aug 11, 2019 at 05:36:26PM +0900, Byungchul Park wrote:
> On Thu, Aug 08, 2019 at 11:09:16AM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 08, 2019 at 11:23:17PM +0900, Byungchul Park wrote:
> > > On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
> > > >
> > > > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > > > [ . . . ]
> > > > > > > > > + for (; head; head = next) {
> > > > > > > > > + next = head->next;
> > > > > > > > > + head->next = NULL;
> > > > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > > > >
> > > > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > > > are not going to be at all happy with this loop.
> > > > > > >
> > > > > > > Ok, will add this here.
> > > > > > >
> > > > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > > > need to become greater-or-equal comparisons or some such.
> > > > > > >
> > > > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > > > then add more tests to validate the improvements.
> > > > > >
> > > > > > Out of pity for people bisecting, we need this fixed up front.
> > > > > >
> > > > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > > > available. That way you are looping with interrupts and preemption
> > > > > > enabled in workqueue context, which is much less damaging than doing so
> > > > > > with interrupts disabled, and possibly even from hard-irq context.
> > > > >
> > > > > Agree.
> > > > >
> > > > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > > > KFREE_MAX_BATCH):
> > > > >
> > > > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > > > >
> > > > > On success: Same as now.
> > > > > On fail: let ->head grow and drain if possible, until reaching to
> > > > > KFREE_MAX_BATCH_FORCE.
> > >
> > > I should've explain this in more detail. This actually mean:
> > >
> > > On fail: Let ->head grow and queue rcu_work when ->head_free == NULL,
> > > until reaching to _FORCE.
> > >
> > > > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > > > one from now on to prevent too many pending requests from being
> > > > > queued for batching work.
> > >
> > > This mean:
> > >
> > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests to be added
> > > from now on but instead handle one by one to prevent too many
> > > pending requests
>
> Oh! I'm sorry for the weird formatted mail that I wrote with another
> mail client than the one I usually use, outside of office.
>
> > > from being queued. Of course, the requests already having been
> > > queued in ->head
> > > so far should be handled by rcu_work when it's possible which can
> > > be checked by
> > > the monitor or kfree_rcu() inside every call.
> >
> > But does this really help? After all, the reason we have piled up a
> > large number of additional callbacks is likely because the grace period
> > is taking a long time, or because a huge number of callbacks has been
> > queued up. Sure, these callbacks might get a head start on the following
> > grace period, but at the expense of still retaining the kfree_rcu()
> > special cases in rcu_do_batch().
>
> Now, I just can see what you want to get with this work. Then we'd
> better avoid that kind of exception as much as possible.
>
> > Another potential issue is interaction with rcu_barrier(). Currently,
> > rcu_barrier() waits for memory passed to prior kfree_rcu() calls to be
> > freed. This is useful to allow a large amount of memory be be completely
> > freed before allocating large amounts more memory. With the earlier
> > version of the patch, an rcu_barrier() followed by a flush_workqueue().
> > But #3 above would reorder the objects so that this approach might not
> > wait for everything.
>
> It doesn't matter by making the queue operated in FIFO manner though,
> so as to guarantee the order.
I only explained the re-ordering problem, but yes, we need to come up
with a way to deal with the synchronization with rcu_barrier(), as you said.
Thanks,
Byungchul
> But now that we can see letting the list just grow works well, we don't
> have to consider this one at the moment. Let's consider this method
> again once we face the problem in the future by any chance.
>
> > We should therefore just let the second list grow. If experience shows
> > a need for callbacks to be sent up more quickly, it should be possible
> > to provide an additional list, so that two lists on a given CPU can both
> > be waiting for a grace period at the same time.
>
> Or the third and fourth list might be needed in some system. But let's
> talk about it later too.
>
> > > > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > > > raising which is doing this loop possibly in irq disabled / hardirq context.
> > >
> > > I added more explanation above. What I suggested is a way to avoid not
> > > only heavy
> > > work within the irq-disabled region of a single kfree_rcu() but also
> > > too many requests
> > > to be queued into ->head.
> >
> > But let's start simple, please!
>
> Yes. The simpler, the better.
>
> > > > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > > > even worse. Consider a real-time system with a lot of memory, in this case
> > > > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > > > would not be Ok.
> > >
> > > Please check the explanation above.
> > >
> > > > But I could make it something like:
> > > > 1. Letting ->head grow if ->head_free busy
> > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > >
> > > This is exactly what Paul said. The problem with this is ->head can grow too
> > > much. That's why I suggested the above one.
> >
> > It can grow quite large, but how do you know that limiting its size will
> > really help? Sure, you have limited the size, but does that really do
>
> To decide the size, we might have to refer to how much pressure on
> memory and RCU there are at that moment and adjust it on runtime.
>
> > anything for the larger problem of extreme kfree_rcu() rates on the one
> > hand and a desire for more efficient handling of kfree_rcu() on the other?
>
> Assuming current RCU logic handles extremly high rate well which is
> anyway true, my answer is *yes*, because batching anyway has pros and
> cons. One of major cons is there must be inevitable kfree_rcu() requests
> that not even request to RCU. By allowing only the size of batching, the
> situation can be mitigated.
>
> I just answered to you. But again, let's talk about it later once we
> face the problem as you said.
>
> Thanks,
> Byungchul
>
> > Thanx, Paul
> >
> > > > This would even improve performance, but will still risk going out of memory.
> > > >
> > > > Thoughts?
> > > >
> > > > thanks,
> > > >
> > > > - Joel
> > > >
> > > > >
> > > > > This way, we can avoid both:
> > > > >
> > > > > 1. too many requests being queued and
> > > > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > > > >
> > > > > Thanks,
> > > > > Byungchul
> > > > >
> > > > > >
> > > > > > But please feel free to come up with a better solution!
> > > > > >
> > > > > > [ . . . ]
> > >
> > >
> > >
> > > --
> > > Thanks,
> > > Byungchul
> > >
On Sun, Aug 11, 2019 at 05:36:26PM +0900, Byungchul Park wrote:
> On Thu, Aug 08, 2019 at 11:09:16AM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 08, 2019 at 11:23:17PM +0900, Byungchul Park wrote:
> > > On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
> > > >
> > > > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > > > [ . . . ]
> > > > > > > > > + for (; head; head = next) {
> > > > > > > > > + next = head->next;
> > > > > > > > > + head->next = NULL;
> > > > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > > > >
> > > > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > > > are not going to be at all happy with this loop.
> > > > > > >
> > > > > > > Ok, will add this here.
> > > > > > >
> > > > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > > > need to become greater-or-equal comparisons or some such.
> > > > > > >
> > > > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > > > then add more tests to validate the improvements.
> > > > > >
> > > > > > Out of pity for people bisecting, we need this fixed up front.
> > > > > >
> > > > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > > > available. That way you are looping with interrupts and preemption
> > > > > > enabled in workqueue context, which is much less damaging than doing so
> > > > > > with interrupts disabled, and possibly even from hard-irq context.
> > > > >
> > > > > Agree.
> > > > >
> > > > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > > > KFREE_MAX_BATCH):
> > > > >
> > > > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > > > >
> > > > > On success: Same as now.
> > > > > On fail: let ->head grow and drain if possible, until reaching to
> > > > > KFREE_MAX_BATCH_FORCE.
> > >
> > > I should've explain this in more detail. This actually mean:
> > >
> > > On fail: Let ->head grow and queue rcu_work when ->head_free == NULL,
> > > until reaching to _FORCE.
> > >
> > > > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > > > one from now on to prevent too many pending requests from being
> > > > > queued for batching work.
> > >
> > > This mean:
> > >
> > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests to be added
> > > from now on but instead handle one by one to prevent too many
> > > pending requests
>
> Oh! I'm sorry for the weird formatted mail that I wrote with another
> mail client than the one I usually use, outside of office.
>
> > > from being queued. Of course, the requests already having been
> > > queued in ->head
> > > so far should be handled by rcu_work when it's possible which can
> > > be checked by
> > > the monitor or kfree_rcu() inside every call.
> >
> > But does this really help? After all, the reason we have piled up a
> > large number of additional callbacks is likely because the grace period
> > is taking a long time, or because a huge number of callbacks has been
> > queued up. Sure, these callbacks might get a head start on the following
> > grace period, but at the expense of still retaining the kfree_rcu()
> > special cases in rcu_do_batch().
>
> Now, I just can see what you want to get with this work. Then we'd
> better avoid that kind of exception as much as possible.
>
> > Another potential issue is interaction with rcu_barrier(). Currently,
> > rcu_barrier() waits for memory passed to prior kfree_rcu() calls to be
> > freed. This is useful to allow a large amount of memory be be completely
> > freed before allocating large amounts more memory. With the earlier
> > version of the patch, an rcu_barrier() followed by a flush_workqueue().
> > But #3 above would reorder the objects so that this approach might not
> > wait for everything.
>
> It doesn't matter by making the queue operated in FIFO manner though,
> so as to guarantee the order.
This isn't that important until it really matters, but *just* in case my
explanation of how what I suggested works was lacking:
1. Mainline code:
---> time goes by (or sequence of kfree_rcu()s)
A B C D E F G H I J K L M N O P Q R ...
| | | | | | | | | | | | | | | | | |
* * * * * * * * * * * * * * * * * *
where * means requesting to RCU gp.
2. With current patch:
---> time goes by (or sequence of kfree_rcu()s)
ABCD(batching) EFGHIJKLMNOPQR(batching) ...
|              |
*              *
where * means requesting to RCU gp.
3. What I was talking about:
1) 'A' ~ 'H' have been queued into ->head and ->head_free is not
available yet.
---> time goes by (or sequence of kfree_rcu()s)
ABCD(batching) EFGH(pending)
|
*
where * means requesting to RCU gp.
2) 'I' was just queued into ->head and ->head_free is still not
available.
ABCD(batching) E FGHI(pending)
|              |
*              *
where * means requesting to RCU gp.
3) 'J' was just queued into ->head and ->head_free is still not
available.
ABCD(batching) E F GHIJ(pending) ...
|              | |
*              * *
where * means requesting to RCU gp.
4) GHIJ was successfully requested for RCU gp and KLMN have been
   queued into ->head, and so on...
ABCD(batching) E F GHIJ(batching) KLMN(pending) ...
|              | | |
*              * * *
where * means requesting to RCU gp.
This explanation doesn't mean I want to use this approach right away; it is
just to show how what I suggested would work better.
Thanks,
Byungchul
> But now that we can see letting the list just grow works well, we don't
> have to consider this one at the moment. Let's consider this method
> again once we face the problem in the future by any chance.
>
> > We should therefore just let the second list grow. If experience shows
> > a need for callbacks to be sent up more quickly, it should be possible
> > to provide an additional list, so that two lists on a given CPU can both
> > be waiting for a grace period at the same time.
>
> Or the third and fourth list might be needed in some system. But let's
> talk about it later too.
>
> > > > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > > > raising which is doing this loop possibly in irq disabled / hardirq context.
> > >
> > > I added more explanation above. What I suggested is a way to avoid not
> > > only heavy
> > > work within the irq-disabled region of a single kfree_rcu() but also
> > > too many requests
> > > to be queued into ->head.
> >
> > But let's start simple, please!
>
> Yes. The simpler, the better.
>
> > > > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > > > even worse. Consider a real-time system with a lot of memory, in this case
> > > > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > > > would not be Ok.
> > >
> > > Please check the explanation above.
> > >
> > > > But I could make it something like:
> > > > 1. Letting ->head grow if ->head_free busy
> > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > >
> > > This is exactly what Paul said. The problem with this is ->head can grow too
> > > much. That's why I suggested the above one.
> >
> > It can grow quite large, but how do you know that limiting its size will
> > really help? Sure, you have limited the size, but does that really do
>
> To decide the size, we might have to refer to how much pressure on
> memory and RCU there are at that moment and adjust it on runtime.
>
> > anything for the larger problem of extreme kfree_rcu() rates on the one
> > hand and a desire for more efficient handling of kfree_rcu() on the other?
>
> Assuming current RCU logic handles extremly high rate well which is
> anyway true, my answer is *yes*, because batching anyway has pros and
> cons. One of major cons is there must be inevitable kfree_rcu() requests
> that not even request to RCU. By allowing only the size of batching, the
> situation can be mitigated.
>
> I just answered to you. But again, let's talk about it later once we
> face the problem as you said.
>
> Thanks,
> Byungchul
>
> > Thanx, Paul
> >
> > > > This would even improve performance, but will still risk going out of memory.
> > > >
> > > > Thoughts?
> > > >
> > > > thanks,
> > > >
> > > > - Joel
> > > >
> > > > >
> > > > > This way, we can avoid both:
> > > > >
> > > > > 1. too many requests being queued and
> > > > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > > > >
> > > > > Thanks,
> > > > > Byungchul
> > > > >
> > > > > >
> > > > > > But please feel free to come up with a better solution!
> > > > > >
> > > > > > [ . . . ]
> > >
> > >
> > >
> > > --
> > > Thanks,
> > > Byungchul
> > >
On Sat, Aug 10, 2019 at 10:26:58PM -0400, Joel Fernandes wrote:
> On Sat, Aug 10, 2019 at 11:24:46AM -0700, Paul E. McKenney wrote:
> > On Sat, Aug 10, 2019 at 12:20:37AM -0400, Joel Fernandes wrote:
> > > On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> > > > On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > [snip]
> > > > > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > > > > {
> > > > > > > > > int cpu;
> > > > > > > > >
> > > > > > > > > + kfree_rcu_batch_init();
> > > > > > > >
> > > > > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > > > > like it should work, but have you tested it?
> > > > > > > >
> > > > > > > > > rcu_early_boot_tests();
> > > > > > > >
> > > > > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > > > > call to kfree_rcu_batch_init() here.
> > > > > > >
> > > > > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > > > > >
> > > > > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > > > > day, so... ;-)
> > > > >
> > > > > I actually did get surprised as well!
> > > > >
> > > > > It appears the timers are not fully initialized so the really early
> > > > > kfree_rcu() call from rcu_init() does cause a splat about an initialized
> > > > > timer spinlock (even though future kfree_rcu()s and the system are working
> > > > > fine all the way into the torture tests).
> > > > >
> > > > > I think to resolve this, we can just not do batching until early_initcall,
> > > > > during which I have an initialization function which switches batching on.
> > > > > >From that point it is safe.
> > > >
> > > > Just go ahead and batch, but don't bother with the timer until
> > > > after single-threaded boot is done. For example, you could check
> > > > rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> > > > (See kernel/rcu/tree_exp.h.)
> > >
> > > Cool, that works nicely and I tested it. Actually I made it such that we
> > > don't need to batch even, before the scheduler is up. I don't see any benefit
> > > of that unless we can see a kfree_rcu() flood happening that early at boot
> > > which seems highly doubtful as a real world case.
> >
> > The benefit is removing the kfree_rcu() special cases from the innards
> > of RCU, for example, in rcu_do_batch(). Another benefit is removing the
> > current restriction on the position of the rcu_head structure within the
> > enclosing data structure.
> >
> > So it would be good to avoid the current kfree_rcu() special casing within
> > RCU itself.
> >
> > Or are you using some trick that avoids both the batching and the current
> > kfree_rcu() special casing?
>
> Oh. I see what you mean. Would it be Ok with you to have that be a follow up
> patch? I am not getting rid (yet) of the special casing in rcu_do_batch in
> this patch, but can do that in another patch.
I am OK having that in another patch, and I will be looking over yours
and Byungchul's two patches tomorrow. If they look OK, I will queue them.
However, I won't send them upstream without a follow-on patch that gets
rid of the kfree_rcu() special casing within rcu_do_batch() and perhaps
elsewhere. This follow-on patch would of course also need to change rcuperf
appropriately.
> For now I am just doing something like the following in kfree_call_rcu(). I
> was almost about to hit send on the v1 and I have been testing this a lot so
> I'll post it anyway; and we can discuss more about this point on that.
>
> +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> +{
> + unsigned long flags;
> + struct kfree_rcu_cpu *krcp;
> + bool monitor_todo;
> +
> + /* kfree_call_rcu() batching requires timers to be up. If the scheduler
> + * is not yet up, just skip batching and do non-batched kfree_call_rcu().
> + */
> + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
> + return kfree_call_rcu_nobatch(head, func);
> +
As a stopgap until the follow-on patch, this looks fine.
Thanx, Paul
On Sat, Aug 10, 2019 at 10:01:54PM -0400, Joel Fernandes wrote:
> On Tue, Aug 06, 2019 at 05:29:15PM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 06, 2019 at 05:20:41PM -0400, Joel Fernandes (Google) wrote:
[ . . . ]
> > It is really easy to confuse "l" and "1" in some fonts, so please use
> > a different name. (From the "showing my age" department: On typical
> > 1970s typewriters, there was no numeral "1" -- you typed the letter
> > "l" instead, thus anticipating at least the first digit of "1337".)
>
> Change l to loops ;). I did see typewriters around in my childhood, I thought
> they were pretty odd machines :-D. I am sure my daughter will think the same
> about land-line phones :-D
Given your daughter's life expectancy, there will likely be a great many
ca-2019 artifacts that will eventually seem quite odd to her. ;-)
[ . . . ]
> > > +/*
> > > + * shutdown kthread. Just waits to be awakened, then shuts down system.
> > > + */
> > > +static int
> > > +kfree_perf_shutdown(void *arg)
> > > +{
> > > + do {
> > > + wait_event(shutdown_wq,
> > > + atomic_read(&n_kfree_perf_thread_ended) >=
> > > + kfree_nrealthreads);
> > > + } while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
> > > +
> > > + smp_mb(); /* Wake before output. */
> > > +
> > > + kfree_perf_cleanup();
> > > + kernel_power_off();
> > > + return -EINVAL;
> > > +}
> >
> > Is there some way to avoid (almost) duplicating rcu_perf_shutdown()?
>
> At the moment, I don't see a good way to do this without passing in function
> pointers or using macros which I think would look uglier than the above
> addition. Sorry.
No problem, just something to keep in mind.
Thanx, Paul
On Sun, Aug 11, 2019 at 05:49:50PM +0900, Byungchul Park wrote:
> On Sun, Aug 11, 2019 at 05:36:26PM +0900, Byungchul Park wrote:
> > On Thu, Aug 08, 2019 at 11:09:16AM -0700, Paul E. McKenney wrote:
> > > On Thu, Aug 08, 2019 at 11:23:17PM +0900, Byungchul Park wrote:
> > > > On Thu, Aug 8, 2019 at 9:56 PM Joel Fernandes <[email protected]> wrote:
> > > > >
> > > > > On Thu, Aug 08, 2019 at 06:52:32PM +0900, Byungchul Park wrote:
> > > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote:
> > > > > > > [ . . . ]
> > > > > > > > > > + for (; head; head = next) {
> > > > > > > > > > + next = head->next;
> > > > > > > > > > + head->next = NULL;
> > > > > > > > > > + __call_rcu(head, head->func, -1, 1);
> > > > > > > > >
> > > > > > > > > We need at least a cond_resched() here. 200,000 times through this loop
> > > > > > > > > in a PREEMPT=n kernel might not always be pretty. Except that this is
> > > > > > > > > invoked directly from kfree_rcu() which might be invoked with interrupts
> > > > > > > > > disabled, which precludes calls to cond_resched(). So the realtime guys
> > > > > > > > > are not going to be at all happy with this loop.
> > > > > > > >
> > > > > > > > Ok, will add this here.
> > > > > > > >
> > > > > > > > > And this loop could be avoided entirely by having a third rcu_head list
> > > > > > > > > in the kfree_rcu_cpu structure. Yes, some of the batches would exceed
> > > > > > > > > KFREE_MAX_BATCH, but given that they are invoked from a workqueue, that
> > > > > > > > > should be OK, or at least more OK than queuing 200,000 callbacks with
> > > > > > > > > interrupts disabled. (If it turns out not to be OK, an array of rcu_head
> > > > > > > > > pointers can be used to reduce the probability of oversized batches.)
> > > > > > > > > This would also mean that the equality comparisons with KFREE_MAX_BATCH
> > > > > > > > > need to become greater-or-equal comparisons or some such.
> > > > > > > >
> > > > > > > > Yes, certainly we can do these kinds of improvements after this patch, and
> > > > > > > > then add more tests to validate the improvements.
> > > > > > >
> > > > > > > Out of pity for people bisecting, we need this fixed up front.
> > > > > > >
> > > > > > > My suggestion is to just allow ->head to grow until ->head_free becomes
> > > > > > > available. That way you are looping with interrupts and preemption
> > > > > > > enabled in workqueue context, which is much less damaging than doing so
> > > > > > > with interrupts disabled, and possibly even from hard-irq context.
> > > > > >
> > > > > > Agree.
> > > > > >
> > > > > > Or after introducing another limit like KFREE_MAX_BATCH_FORCE(>=
> > > > > > KFREE_MAX_BATCH):
> > > > > >
> > > > > > 1. Try to drain it on hitting KFREE_MAX_BATCH as it does.
> > > > > >
> > > > > > On success: Same as now.
> > > > > > On fail: let ->head grow and drain if possible, until reaching to
> > > > > > KFREE_MAX_BATCH_FORCE.
> > > >
> > > > I should've explain this in more detail. This actually mean:
> > > >
> > > > On fail: Let ->head grow and queue rcu_work when ->head_free == NULL,
> > > > until reaching to _FORCE.
> > > >
> > > > > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching but handle one by
> > > > > > one from now on to prevent too many pending requests from being
> > > > > > queued for batching work.
> > > >
> > > > This mean:
> > > >
> > > > 3. On hitting KFREE_MAX_BATCH_FORCE, give up batching requests to be added
> > > > from now on but instead handle one by one to prevent too many
> > > > pending requests
> >
> > Oh! I'm sorry for the weird formatted mail that I wrote with another
> > mail client than the one I usually use, outside of office.
> >
> > > > from being queued. Of course, the requests already having been
> > > > queued in ->head
> > > > so far should be handled by rcu_work when it's possible which can
> > > > be checked by
> > > > the monitor or kfree_rcu() inside every call.
> > >
> > > But does this really help? After all, the reason we have piled up a
> > > large number of additional callbacks is likely because the grace period
> > > is taking a long time, or because a huge number of callbacks has been
> > > queued up. Sure, these callbacks might get a head start on the following
> > > grace period, but at the expense of still retaining the kfree_rcu()
> > > special cases in rcu_do_batch().
> >
> > Now, I just can see what you want to get with this work. Then we'd
> > better avoid that kind of exception as much as possible.
> >
> > > Another potential issue is interaction with rcu_barrier(). Currently,
> > > rcu_barrier() waits for memory passed to prior kfree_rcu() calls to be
> > > freed. This is useful to allow a large amount of memory be be completely
> > > freed before allocating large amounts more memory. With the earlier
> > > version of the patch, an rcu_barrier() followed by a flush_workqueue().
> > > But #3 above would reorder the objects so that this approach might not
> > > wait for everything.
> >
> > It doesn't matter by making the queue operated in FIFO manner though,
> > so as to guarantee the order.
>
> I only explained about the re-order problem but yes, we need to come up
> with how to deal with the synchronization with rcu_barrier() as you said.
Maybe. Note well that I said "potential issue". When I checked a few
years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
They cared instead about call_rcu() callbacks that accessed code or data
that was going to disappear soon, for example, due to module unload or
filesystem unmount.
So it -might- be that rcu_barrier() can stay as it is, but with changes
as needed to documentation.
It also -might- be, maybe now or maybe some time in the future, that
there will need to be a kfree_rcu_barrier() or some such. But if so,
let's not create it until it is needed. For one thing, it is reasonably
likely that something other than a kfree_rcu_barrier() would really
be what was needed. After all, the main point would be to make sure
that the old memory really was freed before allocating new memory.
But if the system had ample memory, why wait? In that case you don't
really need to wait for all the old memory to be freed, but rather for
sufficient memory to be available for allocation.
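For the record, if something along those lines were ever needed, the obvious
first cut would be roughly the sketch below, where kfree_rcu_drain_cpu() is a
made-up helper and the batch work is assumed to run on system_wq. But again,
waiting for "enough memory" rather than "all prior memory" might well be the
better primitive:

/* Sketch only: wait for memory from all prior kfree_rcu() calls. */
void kfree_rcu_barrier_sketch(void)
{
	int cpu;

	/* Made-up helper: push each CPU's pending ->head into a batch. */
	for_each_possible_cpu(cpu)
		kfree_rcu_drain_cpu(cpu);

	rcu_barrier();			/* wait for the batches' grace periods */
	flush_workqueue(system_wq);	/* then for the kfree() work items */
}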
Thanx, Paul
> Thanks,
> Byungchul
>
> > But now that we can see letting the list just grow works well, we don't
> > have to consider this one at the moment. Let's consider this method
> > again once we face the problem in the future by any chance.
> >
> > > We should therefore just let the second list grow. If experience shows
> > > a need for callbacks to be sent up more quickly, it should be possible
> > > to provide an additional list, so that two lists on a given CPU can both
> > > be waiting for a grace period at the same time.
> >
> > Or the third and fourth list might be needed in some system. But let's
> > talk about it later too.
> >
> > > > > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > > > > raising which is doing this loop possibly in irq disabled / hardirq context.
> > > >
> > > > I added more explanation above. What I suggested is a way to avoid not
> > > > only heavy
> > > > work within the irq-disabled region of a single kfree_rcu() but also
> > > > too many requests
> > > > to be queued into ->head.
> > >
> > > But let's start simple, please!
> >
> > Yes. The simpler, the better.
> >
> > > > > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > > > > even worse. Consider a real-time system with a lot of memory, in this case
> > > > > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > > > > would not be Ok.
> > > >
> > > > Please check the explanation above.
> > > >
> > > > > But I could make it something like:
> > > > > 1. Letting ->head grow if ->head_free busy
> > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > >
> > > > This is exactly what Paul said. The problem with this is ->head can grow too
> > > > much. That's why I suggested the above one.
> > >
> > > It can grow quite large, but how do you know that limiting its size will
> > > really help? Sure, you have limited the size, but does that really do
> >
> > To decide the size, we might have to refer to how much pressure on
> > memory and RCU there are at that moment and adjust it on runtime.
> >
> > > anything for the larger problem of extreme kfree_rcu() rates on the one
> > > hand and a desire for more efficient handling of kfree_rcu() on the other?
> >
> > Assuming current RCU logic handles extremly high rate well which is
> > anyway true, my answer is *yes*, because batching anyway has pros and
> > cons. One of major cons is there must be inevitable kfree_rcu() requests
> > that not even request to RCU. By allowing only the size of batching, the
> > situation can be mitigated.
> >
> > I just answered to you. But again, let's talk about it later once we
> > face the problem as you said.
> >
> > Thanks,
> > Byungchul
> >
> > > Thanx, Paul
> > >
> > > > > This would even improve performance, but will still risk going out of memory.
> > > > >
> > > > > Thoughts?
> > > > >
> > > > > thanks,
> > > > >
> > > > > - Joel
> > > > >
> > > > > >
> > > > > > This way, we can avoid both:
> > > > > >
> > > > > > 1. too many requests being queued and
> > > > > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > > > > >
> > > > > > Thanks,
> > > > > > Byungchul
> > > > > >
> > > > > > >
> > > > > > > But please feel free to come up with a better solution!
> > > > > > >
> > > > > > > [ . . . ]
> > > >
> > > >
> > > >
> > > > --
> > > > Thanks,
> > > > Byungchul
> > > >
On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> Maybe. Note well that I said "potential issue". When I checked a few
> years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> They cared instead about call_rcu() callbacks that accessed code or data
> that was going to disappear soon, for example, due to module unload or
> filesystem unmount.
>
> So it -might- be that rcu_barrier() can stay as it is, but with changes
> as needed to documentation.
>
> It also -might- be, maybe now or maybe some time in the future, that
> there will need to be a kfree_rcu_barrier() or some such. But if so,
> let's not create it until it is needed. For one thing, it is reasonably
> likely that something other than a kfree_rcu_barrier() would really
> be what was needed. After all, the main point would be to make sure
> that the old memory really was freed before allocating new memory.
Now I fully understand what you meant. Thank you for
explaining it in detail.
> But if the system had ample memory, why wait? In that case you don't
> really need to wait for all the old memory to be freed, but rather for
> sufficient memory to be available for allocation.
Agree. Totally makes sense.
Thanks,
Byungchul
>
> Thanx, Paul
>
> > Thanks,
> > Byungchul
> >
> > > But now that we can see letting the list just grow works well, we don't
> > > have to consider this one at the moment. Let's consider this method
> > > again once we face the problem in the future by any chance.
> > >
> > > > We should therefore just let the second list grow. If experience shows
> > > > a need for callbacks to be sent up more quickly, it should be possible
> > > > to provide an additional list, so that two lists on a given CPU can both
> > > > be waiting for a grace period at the same time.
> > >
> > > Or the third and fourth list might be needed in some system. But let's
> > > talk about it later too.
> > >
> > > > > > I also agree. But this _FORCE thing will still not solve the issue Paul is
> > > > > > raising which is doing this loop possibly in irq disabled / hardirq context.
> > > > >
> > > > > I added more explanation above. What I suggested is a way to avoid not
> > > > > only heavy
> > > > > work within the irq-disabled region of a single kfree_rcu() but also
> > > > > too many requests
> > > > > to be queued into ->head.
> > > >
> > > > But let's start simple, please!
> > >
> > > Yes. The simpler, the better.
> > >
> > > > > > We can't even cond_resched() here. In fact since _FORCE is larger, it will be
> > > > > > even worse. Consider a real-time system with a lot of memory, in this case
> > > > > > letting ->head grow large is Ok, but looping for long time in IRQ disabled
> > > > > > would not be Ok.
> > > > >
> > > > > Please check the explanation above.
> > > > >
> > > > > > But I could make it something like:
> > > > > > 1. Letting ->head grow if ->head_free busy
> > > > > > 2. If head_free is busy, then just queue/requeue the monitor to try again.
> > > > >
> > > > > This is exactly what Paul said. The problem with this is ->head can grow too
> > > > > much. That's why I suggested the above one.
> > > >
> > > > It can grow quite large, but how do you know that limiting its size will
> > > > really help? Sure, you have limited the size, but does that really do
> > >
> > > To decide the size, we might have to refer to how much pressure on
> > > memory and RCU there are at that moment and adjust it on runtime.
> > >
> > > > anything for the larger problem of extreme kfree_rcu() rates on the one
> > > > hand and a desire for more efficient handling of kfree_rcu() on the other?
> > >
> > > Assuming current RCU logic handles extremly high rate well which is
> > > anyway true, my answer is *yes*, because batching anyway has pros and
> > > cons. One of major cons is there must be inevitable kfree_rcu() requests
> > > that not even request to RCU. By allowing only the size of batching, the
> > > situation can be mitigated.
> > >
> > > I just answered to you. But again, let's talk about it later once we
> > > face the problem as you said.
> > >
> > > Thanks,
> > > Byungchul
> > >
> > > > Thanx, Paul
> > > >
> > > > > > This would even improve performance, but will still risk going out of memory.
> > > > > >
> > > > > > Thoughts?
> > > > > >
> > > > > > thanks,
> > > > > >
> > > > > > - Joel
> > > > > >
> > > > > > >
> > > > > > > This way, we can avoid both:
> > > > > > >
> > > > > > > 1. too many requests being queued and
> > > > > > > 2. __call_rcu() bunch of requests within a single kfree_rcu().
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Byungchul
> > > > > > >
> > > > > > > >
> > > > > > > > But please feel free to come up with a better solution!
> > > > > > > >
> > > > > > > > [ . . . ]
> > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Thanks,
> > > > > Byungchul
> > > > >
On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > Maybe. Note well that I said "potential issue". When I checked a few
> > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > They cared instead about call_rcu() callbacks that accessed code or data
> > that was going to disappear soon, for example, due to module unload or
> > filesystem unmount.
> >
> > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > as needed to documentation.
Right, we should update the docs. Byungchul, do you mind sending a patch that
documents the rcu_barrier() behavior?
> > It also -might- be, maybe now or maybe some time in the future, that
> > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > let's not create it until it is needed. For one thing, it is reasonably
> > likely that something other than a kfree_rcu_barrier() would really
> > be what was needed. After all, the main point would be to make sure
> > that the old memory really was freed before allocating new memory.
>
> Now I fully understand what you meant thanks to you. Thank you for
> explaining it in detail.
>
> > But if the system had ample memory, why wait? In that case you don't
> > really need to wait for all the old memory to be freed, but rather for
> > sufficient memory to be available for allocation.
>
> Agree. Totally makes sense.
Agreed, all makes sense.
thanks,
- Joel
[snip]
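Purely as an illustration of what a kfree_rcu_barrier() "or some such" could
mean (no such API exists in the patch under discussion, and the consensus above
is not to add one until it is needed), a sketch might drain every CPU's pending
batch and then wait for the resulting RCU work; the per-CPU variable krc and the
struct fields come from the posted patch, while the helper itself is hypothetical.

/*
 * Hypothetical sketch only -- not part of the posted patch.  Wait for all
 * objects already handed to kfree_rcu() to be freed: flush each CPU's
 * monitor work so pending ->head objects become a batch, then wait for
 * the queued RCU work (->head_free) to complete.  Objects queued after
 * this function starts are not waited for.
 */
void kfree_rcu_barrier(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

                flush_delayed_work(&krcp->monitor_work);
                flush_rcu_work(&krcp->rcu_work);
        }
}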
On Sun, Aug 11, 2019 at 04:35:04PM -0700, Paul E. McKenney wrote:
> On Sat, Aug 10, 2019 at 10:26:58PM -0400, Joel Fernandes wrote:
> > On Sat, Aug 10, 2019 at 11:24:46AM -0700, Paul E. McKenney wrote:
> > > On Sat, Aug 10, 2019 at 12:20:37AM -0400, Joel Fernandes wrote:
> > > > On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> > > > > On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > [snip]
> > > > > > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > > > > > {
> > > > > > > > > > int cpu;
> > > > > > > > > >
> > > > > > > > > > + kfree_rcu_batch_init();
> > > > > > > > >
> > > > > > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > > > > > like it should work, but have you tested it?
> > > > > > > > >
> > > > > > > > > > rcu_early_boot_tests();
> > > > > > > > >
> > > > > > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > > > > > call to kfree_rcu_batch_init() here.
> > > > > > > >
> > > > > > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > > > > > >
> > > > > > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > > > > > day, so... ;-)
> > > > > >
> > > > > > I actually did get surprised as well!
> > > > > >
> > > > > > It appears the timers are not fully initialized so the really early
> > > > > > kfree_rcu() call from rcu_init() does cause a splat about an uninitialized
> > > > > > timer spinlock (even though future kfree_rcu()s and the system are working
> > > > > > fine all the way into the torture tests).
> > > > > >
> > > > > > I think to resolve this, we can just not do batching until early_initcall,
> > > > > > during which I have an initialization function which switches batching on.
> > > > > > From that point it is safe.
> > > > >
> > > > > Just go ahead and batch, but don't bother with the timer until
> > > > > after single-threaded boot is done. For example, you could check
> > > > > rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> > > > > (See kernel/rcu/tree_exp.h.)
> > > >
> > > > Cool, that works nicely and I tested it. Actually I made it such that we
> > > > don't even need to batch before the scheduler is up. I don't see any benefit
> > > > to doing so unless we can see a kfree_rcu() flood happening that early at
> > > > boot, which seems highly doubtful as a real-world case.
> > >
> > > The benefit is removing the kfree_rcu() special cases from the innards
> > > of RCU, for example, in rcu_do_batch(). Another benefit is removing the
> > > current restriction on the position of the rcu_head structure within the
> > > enclosing data structure.
> > >
> > > So it would be good to avoid the current kfree_rcu() special casing within
> > > RCU itself.
> > >
> > > Or are you using some trick that avoids both the batching and the current
> > > kfree_rcu() special casing?
> >
> > Oh. I see what you mean. Would it be Ok with you to have that be a follow up
> > patch? I am not getting rid (yet) of the special casing in rcu_do_batch in
> > this patch, but can do that in another patch.
>
> I am OK having that in another patch, and I will be looking over yours
> and Byungchul's two patches tomorrow. If they look OK, I will queue them.
Ok, some of the code comments are stale as Byungchul pointed out; allow me to
fix them and then you can look at v3 directly, to save you time.
> However, I won't send them upstream without a follow-on patch that gets
> rid of the kfree_rcu() special casing within rcu_do_batch() and perhaps
> elsewhere. This follow-on patch would of course also need to change rcuperf
> appropriately.
Sounds good.
> > For now I am just doing something like the following in kfree_call_rcu(). I
> > was almost about to hit send on the v1 and I have been testing this a lot so
> > I'll post it anyway; and we can discuss more about this point on that.
> >
> > +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > +{
> > + unsigned long flags;
> > + struct kfree_rcu_cpu *krcp;
> > + bool monitor_todo;
> > +
> > + /* kfree_call_rcu() batching requires timers to be up. If the scheduler
> > + * is not yet up, just skip batching and do non-batched kfree_call_rcu().
> > + */
> > + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
> > + return kfree_call_rcu_nobatch(head, func);
> > +
>
> As a stopgap until the follow-on patch, this looks fine.
Cool, thanks!
- Joel
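For contrast with the stopgap above, a minimal sketch of Paul's suggestion
(keep batching, but do not arm the drain timer until single-threaded boot is
done) might look as follows; the helper name is an assumption, not the posted
code.

/*
 * Illustrative sketch only: objects are always queued on ->head, but the
 * drain timer is armed only once the scheduler (and thus timers) are up.
 * Anything queued earlier is drained by the first monitor run.
 */
static void kfree_rcu_maybe_arm_monitor(struct kfree_rcu_cpu *krcp)
{
        lockdep_assert_held(&krcp->lock);

        /* Too early in boot for delayed work; keep accumulating silently. */
        if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
                return;

        if (!krcp->monitor_todo) {
                krcp->monitor_todo = true;
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
        }
}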
On Mon, Aug 12, 2019 at 09:13:56AM -0400, Joel Fernandes wrote:
> On Sun, Aug 11, 2019 at 04:35:04PM -0700, Paul E. McKenney wrote:
> > On Sat, Aug 10, 2019 at 10:26:58PM -0400, Joel Fernandes wrote:
> > > On Sat, Aug 10, 2019 at 11:24:46AM -0700, Paul E. McKenney wrote:
> > > > On Sat, Aug 10, 2019 at 12:20:37AM -0400, Joel Fernandes wrote:
> > > > > On Fri, Aug 09, 2019 at 08:38:14PM -0700, Paul E. McKenney wrote:
> > > > > > On Fri, Aug 09, 2019 at 10:42:32PM -0400, Joel Fernandes wrote:
> > > > > > > On Wed, Aug 07, 2019 at 10:52:15AM -0700, Paul E. McKenney wrote:
> > > > > > > [snip]
> > > > > > > > > > > @@ -3459,6 +3645,8 @@ void __init rcu_init(void)
> > > > > > > > > > > {
> > > > > > > > > > > int cpu;
> > > > > > > > > > >
> > > > > > > > > > > + kfree_rcu_batch_init();
> > > > > > > > > >
> > > > > > > > > > What happens if someone does a kfree_rcu() before this point? It looks
> > > > > > > > > > like it should work, but have you tested it?
> > > > > > > > > >
> > > > > > > > > > > rcu_early_boot_tests();
> > > > > > > > > >
> > > > > > > > > > For example, by testing it in rcu_early_boot_tests() and moving the
> > > > > > > > > > call to kfree_rcu_batch_init() here.
> > > > > > > > >
> > > > > > > > > I have not tried to do the kfree_rcu() this early. I will try it out.
> > > > > > > >
> > > > > > > > Yeah, well, call_rcu() this early came as a surprise to me back in the
> > > > > > > > day, so... ;-)
> > > > > > >
> > > > > > > I actually did get surprised as well!
> > > > > > >
> > > > > > > It appears the timers are not fully initialized so the really early
> > > > > > > kfree_rcu() call from rcu_init() does cause a splat about an uninitialized
> > > > > > > timer spinlock (even though future kfree_rcu()s and the system are working
> > > > > > > fine all the way into the torture tests).
> > > > > > >
> > > > > > > I think to resolve this, we can just not do batching until early_initcall,
> > > > > > > during which I have an initialization function which switches batching on.
> > > > > > > From that point it is safe.
> > > > > >
> > > > > > Just go ahead and batch, but don't bother with the timer until
> > > > > > after single-threaded boot is done. For example, you could check
> > > > > > rcu_scheduler_active similar to how sync_rcu_exp_select_cpus() does.
> > > > > > (See kernel/rcu/tree_exp.h.)
> > > > >
> > > > > Cool, that works nicely and I tested it. Actually I made it such that we
> > > > > don't even need to batch before the scheduler is up. I don't see any benefit
> > > > > to doing so unless we can see a kfree_rcu() flood happening that early at
> > > > > boot, which seems highly doubtful as a real-world case.
> > > >
> > > > The benefit is removing the kfree_rcu() special cases from the innards
> > > > of RCU, for example, in rcu_do_batch(). Another benefit is removing the
> > > > current restriction on the position of the rcu_head structure within the
> > > > enclosing data structure.
> > > >
> > > > So it would be good to avoid the current kfree_rcu() special casing within
> > > > RCU itself.
> > > >
> > > > Or are you using some trick that avoids both the batching and the current
> > > > kfree_rcu() special casing?
> > >
> > > Oh. I see what you mean. Would it be Ok with you to have that be a follow up
> > > patch? I am not getting rid (yet) of the special casing in rcu_do_batch in
> > > this patch, but can do that in another patch.
> >
> > I am OK having that in another patch, and I will be looking over yours
> > and Byungchul's two patches tomorrow. If they look OK, I will queue them.
>
> Ok, some of the code comments are stale as Byungchul pointed out; allow me to
> fix them and then you can look at v3 directly, to save you time.
Works for me, thank you!
Thanx, Paul
> > However, I won't send them upstream without a follow-on patch that gets
> > rid of the kfree_rcu() special casing within rcu_do_batch() and perhaps
> > elsewhere. This follow-on patch would of course also need to change rcuperf
> > appropriately.
>
> Sounds good.
>
> > > For now I am just doing something like the following in kfree_call_rcu(). I
> > > was almost about to hit send on the v1 and I have been testing this a lot so
> > > I'll post it anyway; and we can discuss more about this point on that.
> > >
> > > +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> > > +{
> > > + unsigned long flags;
> > > + struct kfree_rcu_cpu *krcp;
> > > + bool monitor_todo;
> > > +
> > > + /* kfree_call_rcu() batching requires timers to be up. If the scheduler
> > > + * is not yet up, just skip batching and do non-batched kfree_call_rcu().
> > > + */
> > > + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
> > > + return kfree_call_rcu_nobatch(head, func);
> > > +
> >
> > As a stopgap until the follow-on patch, this looks fine.
>
> Cool, thanks!
>
> - Joel
>
On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > Maybe. Note well that I said "potential issue". When I checked a few
> > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > They cared instead about call_rcu() callbacks that accessed code or data
> > > that was going to disappear soon, for example, due to module unload or
> > > filesystem unmount.
> > >
> > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > as needed to documentation.
>
> Right, we should update the docs. Byungchul, do you mind sending a patch that
> documents the rcu_barrier() behavior?
Are you trying to give me the chance? I feel thankful. I can't try it at
the moment though, as I can't follow up until September. I'd better do it
in September or give it up this time.
Thanks,
Byungchul
> > > It also -might- be, maybe now or maybe some time in the future, that
> > > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > > let's not create it until it is needed. For one thing, it is reasonably
> > > likely that something other than a kfree_rcu_barrier() would really
> > > be what was needed. After all, the main point would be to make sure
> > > that the old memory really was freed before allocating new memory.
> >
> > Now I fully understand what you meant thanks to you. Thank you for
> > explaining it in detail.
> >
> > > But if the system had ample memory, why wait? In that case you don't
> > > really need to wait for all the old memory to be freed, but rather for
> > > sufficient memory to be available for allocation.
> >
> > Agree. Totally makes sense.
>
> Agreed, all makes sense.
>
> thanks,
>
> - Joel
>
> [snip]
On Tue, Aug 13, 2019 at 02:29:54PM +0900, Byungchul Park wrote:
> On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> > On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > > Maybe. Note well that I said "potential issue". When I checked a few
> > > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > > They cared instead about call_rcu() callbacks that accessed code or data
> > > > that was going to disappear soon, for example, due to module unload or
> > > > filesystem unmount.
> > > >
> > > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > > as needed to documentation.
> >
> > Right, we should update the docs. Byungchul, do you mind sending a patch that
> > documents the rcu_barrier() behavior?
>
> Are you trying to give me the chance? I feel thankful. I can't try it at
> the moment though, as I can't follow up until September. I'd better do it
> in September or give it up this time.
Which reminds me... I recall your asking if the kfree_rcu() patch
might be sensitive to the exact hardware, but I cannot locate that
email right off-hand. This is an excellent question! When faced with
floods of kfree_rcu() calls, I would expect some hardware, compiler,
and kernel-configuration sensitivity. Which is why it will likely be
necessary to do a few more improvements over time -- for but one example,
accumulating callbacks into vectors in order to reduce the number of
kfree()-time cache misses.
Thanx, Paul
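As one concrete illustration of the "accumulating callbacks into vectors" idea
above, the post-grace-period reclaim could hand an array of pointers to
kfree_bulk() instead of walking an rcu_head list one kfree() at a time; the
structure and vector size below are assumptions for the sketch, not part of
the posted patch.

/*
 * Illustrative sketch only: gather object pointers into a small vector so
 * that reclaim touches the slab once per vector rather than once per
 * object, reducing kfree()-time cache misses.
 */
#define KFREE_BULK_MAX_ENTR 16          /* Arbitrary, for illustration. */

struct kfree_rcu_bulk_data {
        int nr_records;
        void *records[KFREE_BULK_MAX_ENTR];
};

static void kfree_rcu_bulk_reclaim(struct kfree_rcu_bulk_data *bd)
{
        /* One bulk call frees the whole vector. */
        kfree_bulk(bd->nr_records, bd->records);
        bd->nr_records = 0;
}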
> Thanks,
> Byungchul
>
> > > > It also -might- be, maybe now or maybe some time in the future, that
> > > > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > > > let's not create it until it is needed. For one thing, it is reasonably
> > > > likely that something other than a kfree_rcu_barrier() would really
> > > > be what was needed. After all, the main point would be to make sure
> > > > that the old memory really was freed before allocating new memory.
> > >
> > > Now I fully understand what you meant thanks to you. Thank you for
> > > explaining it in detail.
> > >
> > > > But if the system had ample memory, why wait? In that case you don't
> > > > really need to wait for all the old memory to be freed, but rather for
> > > > sufficient memory to be available for allocation.
> > >
> > > Agree. Totally makes sense.
> >
> > Agreed, all makes sense.
> >
> > thanks,
> >
> > - Joel
> >
> > [snip]
>
On Tue, Aug 13, 2019 at 08:41:45AM -0700, Paul E. McKenney wrote:
> On Tue, Aug 13, 2019 at 02:29:54PM +0900, Byungchul Park wrote:
> > On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> > > On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > > > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > > > Maybe. Note well that I said "potential issue". When I checked a few
> > > > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > > > They cared instead about call_rcu() callbacks that accessed code or data
> > > > > that was going to disappear soon, for example, due to module unload or
> > > > > filesystem unmount.
> > > > >
> > > > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > > > as needed to documentation.
> > >
> > > Right, we should update the docs. Byungchul, do you mind sending a patch that
> > > documents the rcu_barrier() behavior?
> >
> > Are you trying to give me the chance? I feel thankful. I can't try it at
> > the moment though, as I can't follow up until September. I'd better do it
> > in September or give it up this time.
>
> Which reminds me... I recall your asking if the kfree_rcu() patch
> might be sensitive to the exact hardware, but I cannot locate that
> email right off-hand. This is an excellent question! When faced with
> floods of kfree_rcu() calls, I would expect some hardware, compiler,
> and kernel-configuration sensitivity. Which is why it will likely be
Yes.
> necessary to do a few more improvements over time -- for but one example,
> accumulating callbacks into vectors in order to reduce the number of
> kfree()-time cache misses.
Yes. That would be a pretty good way to mitigate the problem. I hope
the simple approach we've taken works well enough that it never becomes
necessary, though.
Or I would check the condition of all system resources, e.g. CPU and
memory, and control their bandwidth, of course only if that actually
happens.
Thanks a lot for sharing your opinion on it!
Thanks,
Byungchul
> Thanx, Paul
>
> > Thanks,
> > Byungchul
> >
> > > > > It also -might- be, maybe now or maybe some time in the future, that
> > > > > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > > > > let's not create it until it is needed. For one thing, it is reasonably
> > > > > likely that something other than a kfree_rcu_barrier() would really
> > > > > be what was needed. After all, the main point would be to make sure
> > > > > that the old memory really was freed before allocating new memory.
> > > >
> > > > Now I fully understand what you meant thanks to you. Thank you for
> > > > explaining it in detail.
> > > >
> > > > > But if the system had ample memory, why wait? In that case you don't
> > > > > really need to wait for all the old memory to be freed, but rather for
> > > > > sufficient memory to be available for allocation.
> > > >
> > > > Agree. Totally makes sense.
> > >
> > > Agreed, all makes sense.
> > >
> > > thanks,
> > >
> > > - Joel
> > >
> > > [snip]
> >
On Wed, Aug 14, 2019 at 09:11:03AM +0900, Byungchul Park wrote:
> On Tue, Aug 13, 2019 at 08:41:45AM -0700, Paul E. McKenney wrote:
> > On Tue, Aug 13, 2019 at 02:29:54PM +0900, Byungchul Park wrote:
> > > On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> > > > On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > > > > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > > > > Maybe. Note well that I said "potential issue". When I checked a few
> > > > > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > > > > They cared instead about call_rcu() callbacks that accessed code or data
> > > > > > that was going to disappear soon, for example, due to module unload or
> > > > > > filesystem unmount.
> > > > > >
> > > > > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > > > > as needed to documentation.
> > > >
> > > > Right, we should update the docs. Byungchul, do you mind sending a patch that
> > > > documents the rcu_barrier() behavior?
> > >
> > > Are you trying to give me the chance? I feel thankful. I can't try it at
> > > the moment though, as I can't follow up until September. I'd better do it
> > > in September or give it up this time.
> >
> > Which reminds me... I recall your asking if the kfree_rcu() patch
> > might be sensitive to the exact hardware, but I cannot locate that
> > email right off-hand. This is an excellent question! When faced with
> > floods of kfree_rcu() calls, I would expect some hardware, compiler,
> > and kernel-configuration sensitivity. Which is why it will likely be
>
> Yes.
>
> > necessary to do a few more improvements over time -- for but one example,
> > accumulating callbacks into vectors in order to reduce the number of
> > kfree()-time cache misses.
>
> Yes. That would be a pretty good way to mitigate the problem. I hope
> the simple approach we've taken works well enough that it never becomes
> necessary, though.
>
> Or I would check the condition of all system resources, e.g. CPU and
> memory, and control their bandwidth, of course only if that actually
> happens.
>
> Thanks a lot for sharing your opinion on it!
Didn't you say earlier that you were getting OOM on your system even
with the patches? Or did I miss the resolution of that issue?
Thanx, Paul
> Thanks,
> Byungchul
>
> > Thanx, Paul
> >
> > > Thanks,
> > > Byungchul
> > >
> > > > > > It also -might- be, maybe now or maybe some time in the future, that
> > > > > > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > > > > > let's not create it until it is needed. For one thing, it is reasonably
> > > > > > likely that something other than a kfree_rcu_barrier() would really
> > > > > > be what was needed. After all, the main point would be to make sure
> > > > > > that the old memory really was freed before allocating new memory.
> > > > >
> > > > > Now I fully understand what you meant thanks to you. Thank you for
> > > > > explaining it in detail.
> > > > >
> > > > > > But if the system had ample memory, why wait? In that case you don't
> > > > > > really need to wait for all the old memory to be freed, but rather for
> > > > > > sufficient memory to be available for allocation.
> > > > >
> > > > > Agree. Totally makes sense.
> > > >
> > > > Agreed, all makes sense.
> > > >
> > > > thanks,
> > > >
> > > > - Joel
> > > >
> > > > [snip]
> > >
>
On Tue, Aug 13, 2019 at 07:53:49PM -0700, Paul E. McKenney wrote:
> On Wed, Aug 14, 2019 at 09:11:03AM +0900, Byungchul Park wrote:
> > On Tue, Aug 13, 2019 at 08:41:45AM -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 13, 2019 at 02:29:54PM +0900, Byungchul Park wrote:
> > > > On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> > > > > On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > > > > > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > > > > > Maybe. Note well that I said "potential issue". When I checked a few
> > > > > > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > > > > > They cared instead about call_rcu() callbacks that accessed code or data
> > > > > > > that was going to disappear soon, for example, due to module unload or
> > > > > > > filesystem unmount.
> > > > > > >
> > > > > > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > > > > > as needed to documentation.
> > > > >
> > > > > Right, we should update the docs. Byungchul, do you mind sending a patch that
> > > > > documents the rcu_barrier() behavior?
> > > >
> > > > Are you trying to give me the chance? I feel thankful. I can't try it at
> > > > the moment though, as I can't follow up until September. I'd better do it
> > > > in September or give it up this time.
> > >
> > > Which reminds me... I recall your asking if the kfree_rcu() patch
> > > might be sensitive to the exact hardware, but I cannot locate that
> > > email right off-hand. This is an excellent question! When faced with
> > > floods of kfree_rcu() calls, I would expect some hardware, compiler,
> > > and kernel-configuration sensitivity. Which is why it will likely be
> >
> > Yes.
> >
> > > necessary to do a few more improvements over time -- for but one example,
> > > accumulating callbacks into vectors in order to reduce the number of
> > > kfree()-time cache misses.
> >
> > Yes. That would be a pretty good way to mitigate the problem. I hope
> > the simple approach we've taken works well enough that it never becomes
> > necessary, though.
> >
> > Or I would check the condition of all system resources, e.g. CPU and
> > memory, and control their bandwidth, of course only if that actually
> > happens.
> >
> > Thanks a lot for sharing your opinion on it!
>
> Didn't you say earlier that you were getting OOM on your system even
> with the patches? Or did I miss the resolution of that issue?
I said I saw OOM with a *larger* value of KFREE_DRAIN_JIFFIES. It was
fine with the patch itself.
Anyway I'm sorry I expressed it in a confusing way.
Thanks,
Byungchul
>
> Thanx, Paul
>
> > Thanks,
> > Byungchul
> >
> > > Thanx, Paul
> > >
> > > > Thanks,
> > > > Byungchul
> > > >
> > > > > > > It also -might- be, maybe now or maybe some time in the future, that
> > > > > > > there will need to be a kfree_rcu_barrier() or some such. But if so,
> > > > > > > let's not create it until it is needed. For one thing, it is reasonably
> > > > > > > likely that something other than a kfree_rcu_barrier() would really
> > > > > > > be what was needed. After all, the main point would be to make sure
> > > > > > > that the old memory really was freed before allocating new memory.
> > > > > >
> > > > > > Now I fully understand what you meant thanks to you. Thank you for
> > > > > > explaining it in detail.
> > > > > >
> > > > > > > But if the system had ample memory, why wait? In that case you don't
> > > > > > > really need to wait for all the old memory to be freed, but rather for
> > > > > > > sufficient memory to be available for allocation.
> > > > > >
> > > > > > Agree. Totally makes sense.
> > > > >
> > > > > Agreed, all makes sense.
> > > > >
> > > > > thanks,
> > > > >
> > > > > - Joel
> > > > >
> > > > > [snip]
> > > >
> >
On Wed, Aug 14, 2019 at 12:43:10PM +0900, Byungchul Park wrote:
> On Tue, Aug 13, 2019 at 07:53:49PM -0700, Paul E. McKenney wrote:
> > On Wed, Aug 14, 2019 at 09:11:03AM +0900, Byungchul Park wrote:
> > > On Tue, Aug 13, 2019 at 08:41:45AM -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 13, 2019 at 02:29:54PM +0900, Byungchul Park wrote:
> > > > > On Mon, Aug 12, 2019 at 09:12:34AM -0400, Joel Fernandes wrote:
> > > > > > On Mon, Aug 12, 2019 at 07:10:52PM +0900, Byungchul Park wrote:
> > > > > > > On Sun, Aug 11, 2019 at 04:49:39PM -0700, Paul E. McKenney wrote:
> > > > > > > > Maybe. Note well that I said "potential issue". When I checked a few
> > > > > > > > years ago, none of the uses of rcu_barrier() cared about kfree_rcu().
> > > > > > > > They cared instead about call_rcu() callbacks that accessed code or data
> > > > > > > > that was going to disappear soon, for example, due to module unload or
> > > > > > > > filesystem unmount.
> > > > > > > >
> > > > > > > > So it -might- be that rcu_barrier() can stay as it is, but with changes
> > > > > > > > as needed to documentation.
> > > > > >
> > > > > > Right, we should update the docs. Byungchul, do you mind sending a patch that
> > > > > > documents the rcu_barrier() behavior?
> > > > >
> > > > > Are you trying to give me the chance? I feel thankful. I can't try it at
> > > > > the moment though, as I can't follow up until September. I'd better do it
> > > > > in September or give it up this time.
> > > >
> > > > Which reminds me... I recall your asking if the kfree_rcu() patch
> > > > might be sensitive to the exact hardware, but I cannot locate that
> > > > email right off-hand. This is an excellent question! When faced with
> > > > floods of kfree_rcu() calls, I would expect some hardware, compiler,
> > > > and kernel-configuration sensitivity. Which is why it will likely be
> > >
> > > Yes.
> > >
> > > > necessary to do a few more improvements over time -- for but one example,
> > > > accumulating callbacks into vectors in order to reduce the number of
> > > > kfree()-time cache misses.
> > >
> > > Yes. That would be a pretty good way to mitigate the problem. I hope
> > > the simple approach we've taken works well enough that it never becomes
> > > necessary, though.
> > >
> > > Or I would check the condition of all system resources, e.g. CPU and
> > > memory, and control their bandwidth, of course only if that actually
> > > happens.
> > >
> > > Thanks a lot for sharing your opinion on it!
> >
> > Didn't you say earlier that you were getting OOM on your system even
> > with the patches? Or did I miss the resolution of that issue?
>
> I said I saw OOM with a *larger* value of KFREE_DRAIN_JIFFIES. It was
> fine with the patch itself.
Got it, thank you!
> Anyway I'm sorry I expressed it in a confusing way.
But what is life without a little confusion? ;-)
Thanx, Paul