2020-10-29 16:52:26

by Uladzislau Rezki

Subject: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

The current memory-allocation interface presents the following
difficulties that this patch is designed to overcome:

a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, lockdep will
complain about a violation ("BUG: Invalid wait context") of the
nesting rules. It performs raw_spinlock vs. spinlock nesting
checks, i.e. it is not legal to acquire a spinlock_t while
holding a raw_spinlock_t.

Internally kfree_rcu() uses raw_spinlock_t, whereas the page
allocator internally uses spinlock_t to access its zones. The
code can also be broken from a higher-level point of view:
<snip>
raw_spin_lock(&some_lock);
kfree_rcu(some_pointer, some_field_offset);
<snip>

b) If built with CONFIG_PREEMPT_RT, spinlock_t is converted into a
sleepable variant. Invoking the page allocator from atomic contexts
then leads to "BUG: scheduling while atomic".

c) call_rcu() is invoked from raw atomic context, and kfree_rcu()
and kvfree_rcu() are expected to be called from raw atomic context
as well.

Move the page allocation out of the contexts that trigger the
kvfree_rcu() function and into a separate worker. When a k[v]free_rcu()
per-cpu page cache is empty, a fallback mechanism is used and a special
job is scheduled to refill the per-cpu cache.
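
For reference, with this change the fast path never enters the page
allocator under krcp->lock; on a cache miss it only kicks the refill
worker, which allocates from regular context. A minimal sketch, abridged
from the hunk below:

<snip>
	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
	if (!success) {
		/* Cache miss: ask a worker to refill the per-CPU cache
		 * from regular context, where GFP_KERNEL is allowed. */
		run_page_cache_worker(krcp);
		...
	}
<snip>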

Link: https://lore.kernel.org/lkml/[email protected]/
Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object")
Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
1 file changed, 66 insertions(+), 43 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 06895ef85d69..f2da2a1cc716 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
* per-CPU. Object size is equal to one page. This value
* can be changed at boot time.
*/
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);

/* Retrieve RCU kthreads priority for rcutorture */
@@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
* In order to save some per-cpu space the list is singular.
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
@@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
bool monitor_todo;
bool initialized;
int count;
+
+ struct work_struct page_cache_work;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
struct llist_head bkvcache;
int nr_bkv_objs;
};
@@ -3217,10 +3225,10 @@ static void kfree_rcu_work(struct work_struct *work)
}
rcu_lock_release(&rcu_callback_map);

- krcp = krc_this_cpu_lock(&flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bkvhead[i]))
bkvhead[i] = NULL;
- krc_this_cpu_unlock(krcp, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);

if (bkvhead[i])
free_page((unsigned long) bkvhead[i]);
@@ -3347,6 +3355,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}

+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_work(system_highpri_wq, &krcp->page_cache_work);
+ return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work);
+ unsigned long flags;
+ bool pushed;
+ int i;
+
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+ if (bnode) {
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+}
+
static inline bool
kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
@@ -3363,32 +3422,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
if (!krcp->bkvhead[idx] ||
krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(krcp);
- if (!bnode) {
- /*
- * To keep this path working on raw non-preemptible
- * sections, prevent the optional entry into the
- * allocator as it uses sleeping locks. In fact, even
- * if the caller of kfree_rcu() is preemptible, this
- * path still is not, as krcp->lock is a raw spinlock.
- * With additional page pre-allocation in the works,
- * hitting this return is going to be much less likely.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return false;
-
- /*
- * NOTE: For one argument of kvfree_rcu() we can
- * drop the lock and get the page in sleepable
- * context. That would allow to maintain an array
- * for the CONFIG_PREEMPT_RT as well if no cached
- * pages are available.
- */
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- }
-
/* Switch to emergency path. */
- if (unlikely(!bnode))
+ if (!bnode)
return false;

/* Initialize the new block. */
@@ -3452,12 +3487,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}

- /*
- * Under high memory pressure GFP_NOWAIT can fail,
- * in that case the emergency path is maintained.
- */
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
+ run_page_cache_worker(krcp);
+
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
@@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)

for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- struct kvfree_rcu_bulk_data *bnode;

for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}

- for (i = 0; i < rcu_min_cached_objs; i++) {
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
- if (bnode)
- put_cached_bnode(krcp, bnode);
- else
- pr_err("Failed to preallocate for %d CPU!\n", cpu);
- }
-
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
--
2.20.1


2020-10-29 16:52:31

by Uladzislau Rezki

Subject: [PATCH 02/16] lib/debug: Remove pointless ARCH_NO_PREEMPT dependencies

From: Thomas Gleixner <[email protected]>

ARCH_NO_PREEMPT disables the selection of CONFIG_PREEMPT_VOLUNTARY and
CONFIG_PREEMPT, but architectures which set this config option still
support preempt count for hard and softirq accounting.

There is absolutely no reason to prevent lockdep from using the preempt
counter nor is there a reason to prevent the enablement of
CONFIG_DEBUG_ATOMIC_SLEEP on such architectures.

Remove the dependencies.
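
As an illustrative sketch (not part of this patch; assert_process_context()
is a hypothetical helper), hard and soft interrupt entry unconditionally
adjusts preempt_count(), so context checks based on it keep working on
ARCH_NO_PREEMPT architectures even though preemption itself is never
enabled:

<snip>
#include <linux/preempt.h>
#include <linux/bug.h>

/* Hypothetical helper, for illustration only. */
static inline void assert_process_context(void)
{
	/*
	 * in_interrupt() reads the HARDIRQ/SOFTIRQ/NMI bits of
	 * preempt_count(), which are maintained even when neither
	 * CONFIG_PREEMPT nor CONFIG_PREEMPT_VOLUNTARY can be selected.
	 */
	WARN_ON_ONCE(in_interrupt());
}
<snip>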

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
lib/Kconfig.debug | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d7a7bc3b6098..89c9a177fb9b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1159,7 +1159,7 @@ config PROVE_LOCKING
select DEBUG_RWSEMS
select DEBUG_WW_MUTEX_SLOWPATH
select DEBUG_LOCK_ALLOC
- select PREEMPT_COUNT if !ARCH_NO_PREEMPT
+ select PREEMPT_COUNT
select TRACE_IRQFLAGS
default n
help
@@ -1321,7 +1321,6 @@ config DEBUG_ATOMIC_SLEEP
bool "Sleep inside atomic section checking"
select PREEMPT_COUNT
depends on DEBUG_KERNEL
- depends on !ARCH_NO_PREEMPT
help
If you say Y here, various routines which may sleep will become very
noisy if they are called inside atomic sections: when a spinlock is
--
2.20.1

2020-10-29 16:52:36

by Uladzislau Rezki

Subject: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
include/linux/pagemap.h | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c77b7c31b2e4..cbfbe2bcca75 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
+ VM_BUG_ON(preemptible())
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
* this for us.
--
2.20.1

2020-10-29 16:52:40

by Uladzislau Rezki

Subject: [PATCH 07/16] locking/bitspinlock: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Will Deacon <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
include/linux/bit_spinlock.h | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h
index bbc4730a6505..1e03d54b0b6f 100644
--- a/include/linux/bit_spinlock.h
+++ b/include/linux/bit_spinlock.h
@@ -90,10 +90,8 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT_COUNT
- return preempt_count();
#else
- return 1;
+ return preempt_count();
#endif
}

--
2.20.1

2020-10-29 16:52:51

by Uladzislau Rezki

Subject: [PATCH 10/16] ARM: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Russell King <[email protected]>
Cc: [email protected]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
arch/arm/include/asm/assembler.h | 11 -----------
arch/arm/kernel/iwmmxt.S | 2 --
arch/arm/mach-ep93xx/crunch-bits.S | 2 --
3 files changed, 15 deletions(-)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index feac2c8b86f2..fce52eedc6e1 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -212,7 +212,6 @@
/*
* Increment/decrement the preempt count.
*/
-#ifdef CONFIG_PREEMPT_COUNT
.macro inc_preempt_count, ti, tmp
ldr \tmp, [\ti, #TI_PREEMPT] @ get preempt count
add \tmp, \tmp, #1 @ increment it
@@ -229,16 +228,6 @@
get_thread_info \ti
dec_preempt_count \ti, \tmp
.endm
-#else
- .macro inc_preempt_count, ti, tmp
- .endm
-
- .macro dec_preempt_count, ti, tmp
- .endm
-
- .macro dec_preempt_count_ti, ti, tmp
- .endm
-#endif

#define USERL(l, x...) \
9999: x; \
diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S
index 0dcae787b004..1f845be78c43 100644
--- a/arch/arm/kernel/iwmmxt.S
+++ b/arch/arm/kernel/iwmmxt.S
@@ -94,9 +94,7 @@ ENTRY(iwmmxt_task_enable)
mov r2, r2 @ cpwait
bl concan_save

-#ifdef CONFIG_PREEMPT_COUNT
get_thread_info r10
-#endif
4: dec_preempt_count r10, r3
ret r9 @ normal exit from exception

diff --git a/arch/arm/mach-ep93xx/crunch-bits.S b/arch/arm/mach-ep93xx/crunch-bits.S
index fb2dbf76f09e..0aabcf4ebe8e 100644
--- a/arch/arm/mach-ep93xx/crunch-bits.S
+++ b/arch/arm/mach-ep93xx/crunch-bits.S
@@ -191,9 +191,7 @@ crunch_load:
cfldr64 mvdx15, [r0, #CRUNCH_MVDX15]

1:
-#ifdef CONFIG_PREEMPT_COUNT
get_thread_info r10
-#endif
2: dec_preempt_count r10, r3
ret lr

--
2.20.1

2020-10-29 16:53:03

by Uladzislau Rezki

Subject: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

CONFIG_PREEMPT_COUNT is now unconditionally enabled, thus a page
can be obtained directly from a kvfree_rcu() path. To distinguish
that case and decide whether it is safe to enter the allocator, the
preemptible() macro is used.

It means that refilling the cache is not time-critical. Switch to a
delayed work, so the actual work is queued from the timer interrupt
with a 1 jiffy delay. Immediately placing a task on the current CPU
can lead to an rq->lock double lock. That is why a delayed method is
used.
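
After this change the refill trigger reduces to the following (abridged
from the diff below); the work runs from a kworker at least one jiffy
later instead of being woken immediately on the local CPU:

<snip>
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
	    !atomic_xchg(&krcp->work_in_progress, 1))
		schedule_delayed_work(&krcp->page_cache_work, 1);
<snip>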

Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/rcu/tree.c | 26 +++++---------------------
1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3f9b016a44dc..9ea55f800b4b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3086,7 +3086,6 @@ struct kfree_rcu_cpu_work {
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
* @work_in_progress: Indicates that page_cache_work is running
- * @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
@@ -3104,9 +3103,8 @@ struct kfree_rcu_cpu {
bool initialized;
int count;

- struct work_struct page_cache_work;
+ struct delayed_work page_cache_work;
atomic_t work_in_progress;
- struct hrtimer hrtimer;

struct llist_head bkvcache;
int nr_bkv_objs;
@@ -3355,22 +3353,12 @@ static void kfree_rcu_monitor(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}

-static enum hrtimer_restart
-schedule_page_work_fn(struct hrtimer *t)
-{
- struct kfree_rcu_cpu *krcp =
- container_of(t, struct kfree_rcu_cpu, hrtimer);
-
- queue_work(system_highpri_wq, &krcp->page_cache_work);
- return HRTIMER_NORESTART;
-}
-
static void fill_page_cache_func(struct work_struct *work)
{
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
- page_cache_work);
+ page_cache_work.work);
unsigned long flags;
bool pushed;
int i;
@@ -3398,12 +3386,8 @@ static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !atomic_xchg(&krcp->work_in_progress, 1)) {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
- }
+ !atomic_xchg(&krcp->work_in_progress, 1))
+ schedule_delayed_work(&krcp->page_cache_work, 1);
}

// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
@@ -4503,7 +4487,7 @@ static void __init kfree_rcu_batch_init(void)
}

INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
--
2.20.1

2020-10-29 16:53:08

by Uladzislau Rezki

Subject: [PATCH 12/16] drm/i915: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Jani Nikula <[email protected]>
Cc: Joonas Lahtinen <[email protected]>
Cc: Rodrigo Vivi <[email protected]>
Cc: David Airlie <[email protected]>
Cc: Daniel Vetter <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
drivers/gpu/drm/i915/Kconfig.debug | 1 -
drivers/gpu/drm/i915/i915_utils.h | 3 +--
2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug
index 25cd9788a4d5..b149f76d3ccd 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -20,7 +20,6 @@ config DRM_I915_DEBUG
bool "Enable additional driver debugging"
depends on DRM_I915
select DEBUG_FS
- select PREEMPT_COUNT
select I2C_CHARDEV
select STACKDEPOT
select DRM_DP_AUX_CHARDEV
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 54773371e6bd..ecfed860b3f6 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -337,8 +337,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms)
(Wmax))
#define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000)

-/* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. */
-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT)
+#ifdef CONFIG_DRM_I915_DEBUG
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic())
#else
# define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0)
--
2.20.1

2020-10-29 16:53:10

by Uladzislau Rezki

Subject: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible

Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
earlier commits in this series, the preemptible() macro now properly
detects preempt-disable code regions even in kernels built with
CONFIG_PREEMPT_NONE.

This commit therefore uses preemptible() to determine whether allocation
is possible at all for double-argument kvfree_rcu(). If !preemptible(),
then allocation is not possible, and kvfree_rcu() falls back to using
the less cache-friendly rcu_head approach. Even when preemptible(),
the caller might be involved in reclaim, so the GFP_ flags used by
double-argument kvfree_rcu() must avoid invoking reclaim processing.

Note that single-argument kvfree_rcu() must be invoked in sleepable
contexts, and that its fallback is the relatively high latency
synchronize_rcu(). Single-argument kvfree_rcu() therefore uses
GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
memory allocator.
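
For clarity, the two calling conventions being distinguished here are
sketched below (the struct, field, and function names are illustrative
only):

<snip>
#include <linux/rcupdate.h>

struct foo {
	struct rcu_head rh;
	/* ... payload ... */
};

static void example_free(struct foo *fp, struct foo *fp2)
{
	/* Double-argument form: legal from atomic context; on allocation
	 * failure it falls back to the less cache-friendly rcu_head path. */
	kvfree_rcu(fp, rh);

	/* Single-argument (head-less) form: the caller must be able to
	 * sleep; the last-resort fallback is synchronize_rcu(). */
	kvfree_rcu(fp2);
}
<snip>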

[ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f2da2a1cc716..3f9b016a44dc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
}
}

+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_sleep is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// If !can_sleep, then if !preemptible() no allocation will be undertaken,
+// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
+// the aforementioned deadlock possibilities. Returns true if ptr was
+// successfully recorded, else the caller must use a fallback.
static inline bool
-kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_sleep)
{
struct kvfree_rcu_bulk_data *bnode;
+ bool can_alloc_page = preemptible();
+ gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
+ GFP_ATOMIC) | __GFP_NOWARN;
int idx;

- if (unlikely(!krcp->initialized))
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
return false;

- lockdep_assert_held(&krcp->lock);
idx = !!is_vmalloc_addr(ptr);

/* Check if a new block is required. */
- if (!krcp->bkvhead[idx] ||
- krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
- bnode = get_cached_bnode(krcp);
- /* Switch to emergency path. */
+ if (!(*krcp)->bkvhead[idx] ||
+ (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc_page) {
+ krc_this_cpu_unlock(*krcp, *flags);
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(gfp);
+ *krcp = krc_this_cpu_lock(flags);
+ }
+
if (!bnode)
return false;

/* Initialize the new block. */
bnode->nr_records = 0;
- bnode->next = krcp->bkvhead[idx];
+ bnode->next = (*krcp)->bkvhead[idx];

/* Attach it to the head. */
- krcp->bkvhead[idx] = bnode;
+ (*krcp)->bkvhead[idx] = bnode;
}

/* Finally insert. */
- krcp->bkvhead[idx]->records
- [krcp->bkvhead[idx]->nr_records++] = ptr;
+ (*krcp)->bkvhead[idx]->records
+ [(*krcp)->bkvhead[idx]->nr_records++] = ptr;

return true;
}
@@ -3474,20 +3492,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
ptr = (unsigned long *) func;
}

- krcp = krc_this_cpu_lock(&flags);
-
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);

- // Mark as success and leave.
- success = true;
- goto unlock_return;
+ return;
}

- success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
if (!success) {
run_page_cache_worker(krcp);

--
2.20.1

2020-10-29 16:53:13

by Uladzislau Rezki

Subject: [PATCH 08/16] uaccess: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
include/linux/uaccess.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index c7c6e8b8344d..d6473a72a336 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -275,9 +275,9 @@ static inline bool pagefault_disabled(void)
*
* This function should only be used by the fault handlers. Other users should
* stick to pagefault_disabled().
- * Please NEVER use preempt_disable() to disable the fault handler. With
- * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
- * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
+ *
+ * Please NEVER use preempt_disable() or local_irq_disable() to disable the
+ * fault handler.
*/
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

--
2.20.1

2020-10-29 16:53:35

by Uladzislau Rezki

Subject: [PATCH 14/16] preempt: Remove PREEMPT_COUNT from Kconfig

From: Thomas Gleixner <[email protected]>

All conditionals and irritations are gone.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/Kconfig.preempt | 3 ---
1 file changed, 3 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f4712ff073b..120b63f0c55a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -74,8 +74,5 @@ config PREEMPT_RT

endchoice

-config PREEMPT_COUNT
- def_bool y
-
config PREEMPTION
bool
--
2.20.1

2020-10-29 16:54:24

by Uladzislau Rezki

Subject: [PATCH 05/16] lockdep: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Will Deacon <[email protected]>
Acked-by: Will Deacon <[email protected]>
[ Rezki: Adapted for 5.10.0-rc1 kernel. ]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
include/linux/lockdep.h | 6 ++----
lib/Kconfig.debug | 1 -
2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index f5594879175a..d05db575f60f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -580,16 +580,14 @@ do { \

#define lockdep_assert_preemption_enabled() \
do { \
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \
- __lockdep_enabled && \
+ WARN_ON_ONCE(__lockdep_enabled && \
(preempt_count() != 0 || \
!this_cpu_read(hardirqs_enabled))); \
} while (0)

#define lockdep_assert_preemption_disabled() \
do { \
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \
- __lockdep_enabled && \
+ WARN_ON_ONCE(__lockdep_enabled && \
(preempt_count() == 0 && \
this_cpu_read(hardirqs_enabled))); \
} while (0)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 89c9a177fb9b..03a85065805e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1159,7 +1159,6 @@ config PROVE_LOCKING
select DEBUG_RWSEMS
select DEBUG_WW_MUTEX_SLOWPATH
select DEBUG_LOCK_ALLOC
- select PREEMPT_COUNT
select TRACE_IRQFLAGS
default n
help
--
2.20.1

2020-10-29 16:54:34

by Uladzislau Rezki

Subject: [PATCH 03/16] preempt: Make preempt count unconditional

From: Thomas Gleixner <[email protected]>

The handling of preempt_count() is inconsistent across kernel
configurations. On kernels which have PREEMPT_COUNT=n,
preempt_disable/enable() and the lock/unlock functions do not affect
the preempt count; only local_bh_disable/enable() and the _bh variants
of locking, soft interrupt delivery, hard interrupt and NMI context
affect it.

It's therefore impossible to have a consistent set of checks which provide
information about the context in which a function is called. In many cases
it makes sense to have separate functions for separate contexts, but there
are valid reasons to avoid that and handle different calling contexts
conditionally.

The lack of such indicators which work on all kernel configurations is a
constant source of trouble, because developers either do not understand
the implications or try to work around this inconsistency in weird ways.
Nor do these issues seem to be caught by reviewers and testing.

Recently merged code does:

gfp = preemptible() ? GFP_KERNEL : GFP_ATOMIC;

Looks obviously correct, except for the fact that preemptible() is
unconditionally false for CONFIG_PREEMPT_COUNT=n, i.e. all allocations in
that code use GFP_ATOMIC on such kernels.
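
For reference, the relevant definitions (abridged from
include/linux/preempt.h) make the failure mode obvious:

<snip>
#ifdef CONFIG_PREEMPT_COUNT
# define preemptible()	(preempt_count() == 0 && !irqs_disabled())
#else
# define preemptible()	0	/* never true on PREEMPT_COUNT=n kernels */
#endif

/* Therefore, on a PREEMPT_COUNT=n kernel: */
gfp = preemptible() ? GFP_KERNEL : GFP_ATOMIC;	/* always GFP_ATOMIC */
<snip>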

Attempts to make preempt count unconditional and consistent have been
rejected in the past with handwaving performance arguments.

Freshly conducted benchmarks did not reveal any measurable impact from
enabling preempt count unconditionally. On kernels with CONFIG_PREEMPT_NONE
or CONFIG_PREEMPT_VOLUNTARY the preempt count is only incremented and
decremented, but the result of the decrement is not tested. In contrast,
enabling CONFIG_PREEMPT, which does test the result, has a small but
measurable impact due to the conditional branch/call.

It's about time to make essential functionality of the kernel consistent
across the various preemption models.

Enable CONFIG_PREEMPT_COUNT unconditionally. Follow up changes will remove
the #ifdeffery and remove the config option at the end.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/Kconfig.preempt | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..3f4712ff073b 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -75,8 +75,7 @@ config PREEMPT_RT
endchoice

config PREEMPT_COUNT
- bool
+ def_bool y

config PREEMPTION
bool
- select PREEMPT_COUNT
--
2.20.1

2020-10-29 16:54:51

by Uladzislau Rezki

Subject: [PATCH 13/16] rcutorture: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Josh Triplett <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Lai Jiangshan <[email protected]>
Cc: Shuah Khan <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
tools/testing/selftests/rcutorture/configs/rcu/SRCU-t | 1 -
tools/testing/selftests/rcutorture/configs/rcu/SRCU-u | 1 -
tools/testing/selftests/rcutorture/configs/rcu/TINY01 | 1 -
tools/testing/selftests/rcutorture/doc/TINY_RCU.txt | 5 ++---
tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 -
.../selftests/rcutorture/formal/srcu-cbmc/src/config.h | 1 -
6 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
index 6c78022c8cd8..553cf6534e67 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
@@ -7,4 +7,3 @@ CONFIG_RCU_TRACE=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
CONFIG_DEBUG_ATOMIC_SLEEP=y
-#CHECK#CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
index c15ada821e45..99563da21732 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
@@ -7,4 +7,3 @@ CONFIG_RCU_TRACE=n
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY01 b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
index 6db705e55487..9b22b8e768d5 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
@@ -10,4 +10,3 @@ CONFIG_RCU_TRACE=n
#CHECK#CONFIG_RCU_STALL_COMMON=n
CONFIG_DEBUG_LOCK_ALLOC=n
CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
index a75b16991a92..d30cedf07826 100644
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -3,11 +3,10 @@ This document gives a brief rationale for the TINY_RCU test cases.

Kconfig Parameters:

-CONFIG_DEBUG_LOCK_ALLOC -- Do all three and none of the three.
-CONFIG_PREEMPT_COUNT
+CONFIG_DEBUG_LOCK_ALLOC -- Do both and none of the two.
CONFIG_RCU_TRACE

-The theory here is that randconfig testing will hit the other six possible
+The theory here is that randconfig testing will hit the other two possible
combinations of these parameters.


diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 1b96d68473b8..cfdd48f689de 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -43,7 +43,6 @@ CONFIG_64BIT

Used only to check CONFIG_RCU_FANOUT value, inspection suffices.

-CONFIG_PREEMPT_COUNT
CONFIG_PREEMPT_RCU

Redundant with CONFIG_PREEMPT, ignore.
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
index 283d7103334f..d0d485d48649 100644
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
@@ -8,7 +8,6 @@
#undef CONFIG_HOTPLUG_CPU
#undef CONFIG_MODULES
#undef CONFIG_NO_HZ_FULL_SYSIDLE
-#undef CONFIG_PREEMPT_COUNT
#undef CONFIG_PREEMPT_RCU
#undef CONFIG_PROVE_RCU
#undef CONFIG_RCU_NOCB_CPU
--
2.20.1

2020-10-29 16:55:17

by Uladzislau Rezki

Subject: [PATCH 11/16] xtensa: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Chris Zankel <[email protected]>
Cc: Max Filippov <[email protected]>
Cc: [email protected]
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
arch/xtensa/kernel/entry.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index 703cf6205efe..5a27dd34d3fc 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -819,7 +819,7 @@ ENTRY(debug_exception)
* preemption if we have HW breakpoints to preserve DEBUGCAUSE.DBNUM
* meaning.
*/
-#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_HAVE_HW_BREAKPOINT)
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
GET_THREAD_INFO(a2, a1)
l32i a3, a2, TI_PRE_COUNT
addi a3, a3, 1
--
2.20.1

2020-10-29 16:55:34

by Uladzislau Rezki

Subject: [PATCH 09/16] sched: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
kernel/sched/core.c | 6 +-----
lib/Kconfig.debug | 1 -
2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab5..e172f2ddfa16 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3702,8 +3702,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* finish_task_switch() for details.
*
* finish_task_switch() will drop rq->lock() and lower preempt_count
- * and the preempt_enable() will end up enabling preemption (on
- * PREEMPT_COUNT kernels).
+ * and the preempt_enable() will end up enabling preemption.
*/

rq = finish_task_switch(prev);
@@ -7307,9 +7306,6 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
if (irqs_disabled())
return;

- if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
- return;
-
if (preempt_count() > preempt_offset)
return;

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 03a85065805e..d62806c81f6d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1318,7 +1318,6 @@ config DEBUG_LOCKDEP

config DEBUG_ATOMIC_SLEEP
bool "Sleep inside atomic section checking"
- select PREEMPT_COUNT
depends on DEBUG_KERNEL
help
If you say Y here, various routines which may sleep will become very
--
2.20.1

2020-10-29 16:56:27

by Uladzislau Rezki

Subject: [PATCH 04/16] preempt: Cleanup PREEMPT_COUNT leftovers

From: Thomas Gleixner <[email protected]>

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
---
include/linux/preempt.h | 37 ++++---------------------------------
1 file changed, 4 insertions(+), 33 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7d9c1c0e149c..513769b1edf8 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -56,8 +56,7 @@
#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/*
- * Disable preemption until the scheduler is running -- use an unconditional
- * value so that it also works on !PREEMPT_COUNT kernels.
+ * Disable preemption until the scheduler is running.
*
* Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
*/
@@ -69,7 +68,6 @@
*
* preempt_count() == 2*PREEMPT_DISABLE_OFFSET
*
- * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
* Note: See finish_task_switch().
*/
#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
@@ -106,11 +104,7 @@
/*
* The preempt_count offset after preempt_disable();
*/
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
-#else
-# define PREEMPT_DISABLE_OFFSET 0
-#endif
+#define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET

/*
* The preempt_count offset after spin_lock()
@@ -122,8 +116,8 @@
*
* spin_lock_bh()
*
- * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
- * softirqs, such that unlock sequences of:
+ * Which need to disable both preemption and softirqs, such that unlock
+ * sequences of:
*
* spin_unlock();
* local_bh_enable();
@@ -164,8 +158,6 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)

-#ifdef CONFIG_PREEMPT_COUNT
-
#define preempt_disable() \
do { \
preempt_count_inc(); \
@@ -231,27 +223,6 @@ do { \
__preempt_count_dec(); \
} while (0)

-#else /* !CONFIG_PREEMPT_COUNT */
-
-/*
- * Even if we don't have any preemption, we need preempt disable/enable
- * to be barriers, so that we don't have things like get_user/put_user
- * that can cause faults and scheduling migrate into our preempt-protected
- * region.
- */
-#define preempt_disable() barrier()
-#define sched_preempt_enable_no_resched() barrier()
-#define preempt_enable_no_resched() barrier()
-#define preempt_enable() barrier()
-#define preempt_check_resched() do { } while (0)
-
-#define preempt_disable_notrace() barrier()
-#define preempt_enable_no_resched_notrace() barrier()
-#define preempt_enable_notrace() barrier()
-#define preemptible() 0
-
-#endif /* CONFIG_PREEMPT_COUNT */
-
#ifdef MODULE
/*
* Modules have no business playing preemption tricks.
--
2.20.1

2020-10-29 19:49:29

by Paul E. McKenney

Subject: Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> can be obtained directly from a kvfree_rcu() path. To distinguish
> that and take a decision the preemptable() macro is used when it
> is save to enter allocator.
>
> It means that refilling a cache is not important from timing point
> of view. Switch to a delayed work, so the actual work is queued from
> the timer interrupt with 1 jiffy delay. An immediate placing a task
> on a current CPU can lead to rq->lock double lock. That is why a
> delayed method is in place.
>
> Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>

Thank you, Uladzislau!

I applied this on top of v5.10-rc1 and got the following from the
single-CPU builds:

SYNC include/config/auto.conf.cmd
DESCEND objtool
CC kernel/bounds.s
CALL scripts/atomic/check-atomics.sh
UPD include/generated/bounds.h
CC arch/x86/kernel/asm-offsets.s
In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
from ./include/linux/atomic.h:82,
from ./include/linux/crypto.h:15,
from arch/x86/kernel/asm-offsets.c:9:
./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
^~~~~~~~~~~~~~~~~~~~
./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
VM_BUG_ON(preemptible())
^~~~~~~~~
scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
Makefile:1199: recipe for target 'prepare0' failed
make: *** [prepare0] Error 2

I vaguely recall something like this showing up in the previous series
and that we did something or another to address it. Could you please
check against the old series at -rcu branch dev.2020.10.22a? (I verified
that the old series does run correctly in the single-CPU scenarios.)

Thanx, Paul

> ---
> kernel/rcu/tree.c | 26 +++++---------------------
> 1 file changed, 5 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 3f9b016a44dc..9ea55f800b4b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3086,7 +3086,6 @@ struct kfree_rcu_cpu_work {
> * per-cpu lock.
> * @page_cache_work: A work to refill the cache when it is empty
> * @work_in_progress: Indicates that page_cache_work is running
> - * @hrtimer: A hrtimer for scheduling a page_cache_work
> * @nr_bkv_objs: number of allocated objects at @bkvcache.
> *
> * This is a per-CPU structure. The reason that it is not included in
> @@ -3104,9 +3103,8 @@ struct kfree_rcu_cpu {
> bool initialized;
> int count;
>
> - struct work_struct page_cache_work;
> + struct delayed_work page_cache_work;
> atomic_t work_in_progress;
> - struct hrtimer hrtimer;
>
> struct llist_head bkvcache;
> int nr_bkv_objs;
> @@ -3355,22 +3353,12 @@ static void kfree_rcu_monitor(struct work_struct *work)
> raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> -static enum hrtimer_restart
> -schedule_page_work_fn(struct hrtimer *t)
> -{
> - struct kfree_rcu_cpu *krcp =
> - container_of(t, struct kfree_rcu_cpu, hrtimer);
> -
> - queue_work(system_highpri_wq, &krcp->page_cache_work);
> - return HRTIMER_NORESTART;
> -}
> -
> static void fill_page_cache_func(struct work_struct *work)
> {
> struct kvfree_rcu_bulk_data *bnode;
> struct kfree_rcu_cpu *krcp =
> container_of(work, struct kfree_rcu_cpu,
> - page_cache_work);
> + page_cache_work.work);
> unsigned long flags;
> bool pushed;
> int i;
> @@ -3398,12 +3386,8 @@ static void
> run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> {
> if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> - !atomic_xchg(&krcp->work_in_progress, 1)) {
> - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> - HRTIMER_MODE_REL);
> - krcp->hrtimer.function = schedule_page_work_fn;
> - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> - }
> + !atomic_xchg(&krcp->work_in_progress, 1))
> + schedule_delayed_work(&krcp->page_cache_work, 1);
> }
>
> // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> @@ -4503,7 +4487,7 @@ static void __init kfree_rcu_batch_init(void)
> }
>
> INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> - INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
> krcp->initialized = true;
> }
> if (register_shrinker(&kfree_rcu_shrinker))
> --
> 2.20.1
>

2020-10-29 20:15:34

by Uladzislau Rezki

Subject: Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > can be obtained directly from a kvfree_rcu() path. To distinguish
> > that and take a decision the preemptable() macro is used when it
> > is save to enter allocator.
> >
> > It means that refilling a cache is not important from timing point
> > of view. Switch to a delayed work, so the actual work is queued from
> > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > on a current CPU can lead to rq->lock double lock. That is why a
> > delayed method is in place.
> >
> > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
>
> Thank you, Uladzislau!
>
> I applied this on top of v5.10-rc1 and got the following from the
> single-CPU builds:
>
> SYNC include/config/auto.conf.cmd
> DESCEND objtool
> CC kernel/bounds.s
> CALL scripts/atomic/check-atomics.sh
> UPD include/generated/bounds.h
> CC arch/x86/kernel/asm-offsets.s
> In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> from ./include/linux/atomic.h:82,
> from ./include/linux/crypto.h:15,
> from arch/x86/kernel/asm-offsets.c:9:
> ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> ^~~~~~~~~~~~~~~~~~~~
> ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> VM_BUG_ON(preemptible())
> ^~~~~~~~~
> scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> Makefile:1199: recipe for target 'prepare0' failed
> make: *** [prepare0] Error 2
>
> I vaguely recall something like this showing up in the previous series
> and that we did something or another to address it. Could you please
> check against the old series at -rcu branch dev.2020.10.22a? (I verified
> that the old series does run correctly in the single-CPU scenarios.)
>
I see the same build error. Will double check if we have similar in the
previous series also. It looks like the error is caused by the Thomas series.

Will check!

--
Vlad Rezki

2020-10-29 20:24:44

by Uladzislau Rezki

Subject: Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > that and take a decision the preemptable() macro is used when it
> > > is save to enter allocator.
> > >
> > > It means that refilling a cache is not important from timing point
> > > of view. Switch to a delayed work, so the actual work is queued from
> > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > on a current CPU can lead to rq->lock double lock. That is why a
> > > delayed method is in place.
> > >
> > > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> >
> > Thank you, Uladzislau!
> >
> > I applied this on top of v5.10-rc1 and got the following from the
> > single-CPU builds:
> >
> > SYNC include/config/auto.conf.cmd
> > DESCEND objtool
> > CC kernel/bounds.s
> > CALL scripts/atomic/check-atomics.sh
> > UPD include/generated/bounds.h
> > CC arch/x86/kernel/asm-offsets.s
> > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> > from ./include/linux/atomic.h:82,
> > from ./include/linux/crypto.h:15,
> > from arch/x86/kernel/asm-offsets.c:9:
> > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> > #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> > ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> > #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> > ^~~~~~~~~~~~~~~~~~~~
> > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> > VM_BUG_ON(preemptible())
> > ^~~~~~~~~
> > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > Makefile:1199: recipe for target 'prepare0' failed
> > make: *** [prepare0] Error 2
> >
> > I vaguely recall something like this showing up in the previous series
> > and that we did something or another to address it. Could you please
> > check against the old series at -rcu branch dev.2020.10.22a? (I verified
> > that the old series does run correctly in the single-CPU scenarios.)
> >
> I see the same build error. Will double check if we have similar in the
> previous series also. It looks like the error is caused by the Thomas series.
>
> Will check!
>
OK. Found it:

urezki@pc638:~/data/coding/linux.git$ git diff
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cbfbe2bcca75..7dd3523093db 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
- VM_BUG_ON(preemptible())
+ VM_BUG_ON(preemptible());
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
* this for us.
urezki@pc638:~/data/coding/linux.git$

I guess we had some extra patch that fixes a kernel compilation for !SMP
case. Will check dev.2020.10.22a.

--
Vlad Rezki

2020-10-29 20:35:49

by Paul E. McKenney

Subject: Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

On Thu, Oct 29, 2020 at 09:22:41PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> > On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > > that and take a decision the preemptable() macro is used when it
> > > > is save to enter allocator.
> > > >
> > > > It means that refilling a cache is not important from timing point
> > > > of view. Switch to a delayed work, so the actual work is queued from
> > > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > > on a current CPU can lead to rq->lock double lock. That is why a
> > > > delayed method is in place.
> > > >
> > > > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> > >
> > > Thank you, Uladzislau!
> > >
> > > I applied this on top of v5.10-rc1 and got the following from the
> > > single-CPU builds:
> > >
> > > SYNC include/config/auto.conf.cmd
> > > DESCEND objtool
> > > CC kernel/bounds.s
> > > CALL scripts/atomic/check-atomics.sh
> > > UPD include/generated/bounds.h
> > > CC arch/x86/kernel/asm-offsets.s
> > > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> > > from ./include/linux/atomic.h:82,
> > > from ./include/linux/crypto.h:15,
> > > from arch/x86/kernel/asm-offsets.c:9:
> > > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> > > #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> > > ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> > > #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> > > ^~~~~~~~~~~~~~~~~~~~
> > > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> > > VM_BUG_ON(preemptible())
> > > ^~~~~~~~~
> > > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > > Makefile:1199: recipe for target 'prepare0' failed
> > > make: *** [prepare0] Error 2
> > >
> > > I vaguely recall something like this showing up in the previous series
> > > and that we did something or another to address it. Could you please
> > > check against the old series at -rcu branch dev.2020.10.22a? (I verified
> > > that the old series does run correctly in the single-CPU scenarios.)
> > >
> > I see the same build error. Will double check if we have similar in the
> > previous series also. It looks like the error is caused by the Thomas series.
> >
> > Will check!
> >
> OK. Found it:
>
> urezki@pc638:~/data/coding/linux.git$ git diff
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index cbfbe2bcca75..7dd3523093db 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
> static inline int __page_cache_add_speculative(struct page *page, int count)
> {
> #ifdef CONFIG_TINY_RCU
> - VM_BUG_ON(preemptible())
> + VM_BUG_ON(preemptible());
> /*
> * Preempt must be disabled here - we rely on rcu_read_lock doing
> * this for us.
> urezki@pc638:~/data/coding/linux.git$
>
> I guess we had some extra patch that fixes a kernel compilation for !SMP
> case. Will check dev.2020.10.22a.

I guess I am blind today!

And yes, that semicolon is in the corresponding commit in -tip.

So, an important safety tip: When forward porting, start from the
commits that have been tested rather than from the original series
of patches. ;-)

Thanx, Paul

2020-10-29 21:00:08

by Uladzislau Rezki

Subject: Re: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers

On Thu, Oct 29, 2020 at 05:50:09PM +0100, Uladzislau Rezki (Sony) wrote:
> From: Thomas Gleixner <[email protected]>
>
> CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
> removed. Cleanup the leftovers before doing so.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: [email protected]
> Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> ---
> include/linux/pagemap.h | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c77b7c31b2e4..cbfbe2bcca75 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
> static inline int __page_cache_add_speculative(struct page *page, int count)
> {
> #ifdef CONFIG_TINY_RCU
> -# ifdef CONFIG_PREEMPT_COUNT
> - VM_BUG_ON(!in_atomic() && !irqs_disabled());
> -# endif
> + VM_BUG_ON(preemptible())
> /*
> * Preempt must be disabled here - we rely on rcu_read_lock doing
> * this for us.
> --
> 2.20.1
>
Hello, Paul.

Sorry for the small mistake; it was fixed by you before, whereas I took an
old version of the patch in question. Please use the one below instead of
the posted one:

Author: Thomas Gleixner <[email protected]>
Date: Mon Sep 14 19:25:00 2020 +0200

mm/pagemap: Cleanup PREEMPT_COUNT leftovers

CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
removed. Cleanup the leftovers before doing so.

Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
[ paulmck: Fix !SMP build error per kernel test robot feedback. ]
Signed-off-by: Paul E. McKenney <[email protected]>

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7de11dcd534d..b3d9d9217ea0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -168,9 +168,7 @@ void release_pages(struct page **pages, int nr);
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
-# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
+ VM_BUG_ON(preemptible());
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
* this for us.

--
Vlad Rezki

2020-10-29 21:02:27

by Uladzislau Rezki

Subject: Re: [PATCH 16/16] rcu/tree: Use delayed work instead of hrtimer to refill the cache

On Thu, Oct 29, 2020 at 01:33:55PM -0700, Paul E. McKenney wrote:
> On Thu, Oct 29, 2020 at 09:22:41PM +0100, Uladzislau Rezki wrote:
> > On Thu, Oct 29, 2020 at 09:13:42PM +0100, Uladzislau Rezki wrote:
> > > On Thu, Oct 29, 2020 at 12:47:24PM -0700, Paul E. McKenney wrote:
> > > > On Thu, Oct 29, 2020 at 05:50:19PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > A CONFIG_PREEMPT_COUNT is unconditionally enabled, thus a page
> > > > > can be obtained directly from a kvfree_rcu() path. To distinguish
> > > > > that and take a decision the preemptable() macro is used when it
> > > > > is save to enter allocator.
> > > > >
> > > > > It means that refilling a cache is not important from timing point
> > > > > of view. Switch to a delayed work, so the actual work is queued from
> > > > > the timer interrupt with 1 jiffy delay. An immediate placing a task
> > > > > on a current CPU can lead to rq->lock double lock. That is why a
> > > > > delayed method is in place.
> > > > >
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> > > >
> > > > Thank you, Uladzislau!
> > > >
> > > > I applied this on top of v5.10-rc1 and got the following from the
> > > > single-CPU builds:
> > > >
> > > > SYNC include/config/auto.conf.cmd
> > > > DESCEND objtool
> > > > CC kernel/bounds.s
> > > > CALL scripts/atomic/check-atomics.sh
> > > > UPD include/generated/bounds.h
> > > > CC arch/x86/kernel/asm-offsets.s
> > > > In file included from ./include/asm-generic/atomic-instrumented.h:20:0,
> > > > from ./include/linux/atomic.h:82,
> > > > from ./include/linux/crypto.h:15,
> > > > from arch/x86/kernel/asm-offsets.c:9:
> > > > ./include/linux/pagemap.h: In function ‘__page_cache_add_speculative’:
> > > > ./include/linux/build_bug.h:30:34: error: called object is not a function or function pointer
> > > > #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
> > > > ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > > > ./include/linux/mmdebug.h:45:25: note: in expansion of macro ‘BUILD_BUG_ON_INVALID’
> > > > #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
> > > > ^~~~~~~~~~~~~~~~~~~~
> > > > ./include/linux/pagemap.h:207:2: note: in expansion of macro ‘VM_BUG_ON’
> > > > VM_BUG_ON(preemptible())
> > > > ^~~~~~~~~
> > > > scripts/Makefile.build:117: recipe for target 'arch/x86/kernel/asm-offsets.s' failed
> > > > make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1
> > > > Makefile:1199: recipe for target 'prepare0' failed
> > > > make: *** [prepare0] Error 2
> > > >
> > > > I vaguely recall something like this showing up in the previous series
> > > > and that we did something or another to address it. Could you please
> > > > check against the old series at -rcu branch dev.2020.10.22a? (I verified
> > > > that the old series does run correctly in the single-CPU scenarios.)
> > > >
> > > I see the same build error. Will double check if we have similar in the
> > > previous series also. It looks like the error is caused by the Thomas series.
> > >
> > > Will check!
> > >
> > OK. Found it:
> >
> > urezki@pc638:~/data/coding/linux.git$ git diff
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index cbfbe2bcca75..7dd3523093db 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -204,7 +204,7 @@ void release_pages(struct page **pages, int nr);
> > static inline int __page_cache_add_speculative(struct page *page, int count)
> > {
> > #ifdef CONFIG_TINY_RCU
> > - VM_BUG_ON(preemptible())
> > + VM_BUG_ON(preemptible());
> > /*
> > * Preempt must be disabled here - we rely on rcu_read_lock doing
> > * this for us.
> > urezki@pc638:~/data/coding/linux.git$
> >
> > I guess we had some extra patch that fixes a kernel compilation for !SMP
> > case. Will check dev.2020.10.22a.
>
> I guess I am blind today!
>
> And yes, that semicolon is in the corresponding commit in -tip.
>
> So, an important safety tip: When forward porting, start from the
> commits that have been tested rather than from the original series
> of patches. ;-)
>
> Thanx, Paul
Right. Sorry for that. I remember you mentioned a build error
some time ago. My bad :)

I have updated [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers
in this series.

--
Vlad Rezki

2020-10-29 21:29:35

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 06/16] mm/pagemap: Cleanup PREEMPT_COUNT leftovers

On Thu, Oct 29, 2020 at 09:57:17PM +0100, Uladzislau Rezki wrote:
> On Thu, Oct 29, 2020 at 05:50:09PM +0100, Uladzislau Rezki (Sony) wrote:
> > From: Thomas Gleixner <[email protected]>
> >
> > CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
> > removed. Cleanup the leftovers before doing so.
> >
> > Signed-off-by: Thomas Gleixner <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > Cc: [email protected]
> > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> > ---
> > include/linux/pagemap.h | 4 +---
> > 1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index c77b7c31b2e4..cbfbe2bcca75 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -204,9 +204,7 @@ void release_pages(struct page **pages, int nr);
> > static inline int __page_cache_add_speculative(struct page *page, int count)
> > {
> > #ifdef CONFIG_TINY_RCU
> > -# ifdef CONFIG_PREEMPT_COUNT
> > - VM_BUG_ON(!in_atomic() && !irqs_disabled());
> > -# endif
> > + VM_BUG_ON(preemptible())
> > /*
> > * Preempt must be disabled here - we rely on rcu_read_lock doing
> > * this for us.
> > --
> > 2.20.1
> >
> Hello, Paul.
>
> Sorry for a small mistake, it was fixed by you before, whereas i took an
> old version of the patch that is question. Please use below one instead of
> posted one:

We have all been there and done that! ;-)

I will give this update a spin and see what happens.

Thanx, Paul

> Author: Thomas Gleixner <[email protected]>
> Date: Mon Sep 14 19:25:00 2020 +0200
>
> mm/pagemap: Cleanup PREEMPT_COUNT leftovers
>
> CONFIG_PREEMPT_COUNT is now unconditionally enabled and will be
> removed. Cleanup the leftovers before doing so.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: [email protected]
> [ paulmck: Fix !SMP build error per kernel test robot feedback. ]
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 7de11dcd534d..b3d9d9217ea0 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -168,9 +168,7 @@ void release_pages(struct page **pages, int nr);
> static inline int __page_cache_add_speculative(struct page *page, int count)
> {
> #ifdef CONFIG_TINY_RCU
> -# ifdef CONFIG_PREEMPT_COUNT
> - VM_BUG_ON(!in_atomic() && !irqs_disabled());
> -# endif
> + VM_BUG_ON(preemptible());
> /*
> * Preempt must be disabled here - we rely on rcu_read_lock doing
> * this for us.
>
> --
> Vlad Rezki

2020-11-03 15:50:27

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> The current memmory-allocation interface presents to following
> difficulties that this patch is designed to overcome:
>
> a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> complain about violation("BUG: Invalid wait context") of the
> nesting rules. It does the raw_spinlock vs. spinlock nesting
> checks, i.e. it is not legal to acquire a spinlock_t while
> holding a raw_spinlock_t.
>
> Internally the kfree_rcu() uses raw_spinlock_t whereas the
> "page allocator" internally deals with spinlock_t to access
> to its zones. The code also can be broken from higher level
> of view:
> <snip>
> raw_spin_lock(&some_lock);
> kfree_rcu(some_pointer, some_field_offset);
> <snip>
>
> b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> is converted into sleepable variant. Invoking the page allocator from
> atomic contexts leads to "BUG: scheduling while atomic".
>
> c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> and kvfree_rcu() are expected to be called from atomic raw context
> as well.
>
> Move out a page allocation from contexts which trigger kvfree_rcu()
> function to the separate worker. When a k[v]free_rcu() per-cpu page
> cache is empty a fallback mechanism is used and a special job is
> scheduled to refill the per-cpu cache.

Looks good, still reviewing here. BTW just for my education, I was wondering
about Thomas's email:
https://lkml.org/lkml/2020/8/11/939

If slab allocations in pure raw-atomic context on RT are not allowed or
recommended, should kfree_rcu() be allowed?
slab can have the same issue, right? If the per-cpu cache is drained, it has to
allocate a page from the buddy allocator, and there's no GFP flag to tell it about
the context the allocation is happening from.

Or are we saying that we want to support kfree on RT from raw atomic
context, even though kmalloc is not supported? I hate to bring up this
elephant in the room, but since I am a part of the people maintaining this
code, I believe I would rather set some rules than support unsupported
usages. :-\ (Once I know what is supported and what isn't, that is). If indeed
raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
put a giant warning there rather than supporting it :-(.

But I don't stand in the way of this patch if all agree on it. thanks,

- Joel


> Link: https://lore.kernel.org/lkml/[email protected]/
> Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object")
> Reported-by: Sebastian Andrzej Siewior <[email protected]>
> Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> ---
> kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> 1 file changed, 66 insertions(+), 43 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 06895ef85d69..f2da2a1cc716 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> * per-CPU. Object size is equal to one page. This value
> * can be changed at boot time.
> */
> -static int rcu_min_cached_objs = 2;
> +static int rcu_min_cached_objs = 5;
> module_param(rcu_min_cached_objs, int, 0444);
>
> /* Retrieve RCU kthreads priority for rcutorture */
> @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> * In order to save some per-cpu space the list is singular.
> * Even though it is lockless an access has to be protected by the
> * per-cpu lock.
> + * @page_cache_work: A work to refill the cache when it is empty
> + * @work_in_progress: Indicates that page_cache_work is running
> + * @hrtimer: A hrtimer for scheduling a page_cache_work
> * @nr_bkv_objs: number of allocated objects at @bkvcache.
> *
> * This is a per-CPU structure. The reason that it is not included in
> @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> bool monitor_todo;
> bool initialized;
> int count;
> +
> + struct work_struct page_cache_work;
> + atomic_t work_in_progress;
> + struct hrtimer hrtimer;
> +
> struct llist_head bkvcache;
> int nr_bkv_objs;
> };
> @@ -3217,10 +3225,10 @@ static void kfree_rcu_work(struct work_struct *work)
> }
> rcu_lock_release(&rcu_callback_map);
>
> - krcp = krc_this_cpu_lock(&flags);
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> if (put_cached_bnode(krcp, bkvhead[i]))
> bkvhead[i] = NULL;
> - krc_this_cpu_unlock(krcp, flags);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
>
> if (bkvhead[i])
> free_page((unsigned long) bkvhead[i]);
> @@ -3347,6 +3355,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
> raw_spin_unlock_irqrestore(&krcp->lock, flags);
> }
>
> +static enum hrtimer_restart
> +schedule_page_work_fn(struct hrtimer *t)
> +{
> + struct kfree_rcu_cpu *krcp =
> + container_of(t, struct kfree_rcu_cpu, hrtimer);
> +
> + queue_work(system_highpri_wq, &krcp->page_cache_work);
> + return HRTIMER_NORESTART;
> +}
> +
> +static void fill_page_cache_func(struct work_struct *work)
> +{
> + struct kvfree_rcu_bulk_data *bnode;
> + struct kfree_rcu_cpu *krcp =
> + container_of(work, struct kfree_rcu_cpu,
> + page_cache_work);
> + unsigned long flags;
> + bool pushed;
> + int i;
> +
> + for (i = 0; i < rcu_min_cached_objs; i++) {
> + bnode = (struct kvfree_rcu_bulk_data *)
> + __get_free_page(GFP_KERNEL | __GFP_NOWARN);
> +
> + if (bnode) {
> + raw_spin_lock_irqsave(&krcp->lock, flags);
> + pushed = put_cached_bnode(krcp, bnode);
> + raw_spin_unlock_irqrestore(&krcp->lock, flags);
> +
> + if (!pushed) {
> + free_page((unsigned long) bnode);
> + break;
> + }
> + }
> + }
> +
> + atomic_set(&krcp->work_in_progress, 0);
> +}
> +
> +static void
> +run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> +{
> + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
> + !atomic_xchg(&krcp->work_in_progress, 1)) {
> + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> + HRTIMER_MODE_REL);
> + krcp->hrtimer.function = schedule_page_work_fn;
> + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> + }
> +}
> +
> static inline bool
> kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> {
> @@ -3363,32 +3422,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> if (!krcp->bkvhead[idx] ||
> krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> bnode = get_cached_bnode(krcp);
> - if (!bnode) {
> - /*
> - * To keep this path working on raw non-preemptible
> - * sections, prevent the optional entry into the
> - * allocator as it uses sleeping locks. In fact, even
> - * if the caller of kfree_rcu() is preemptible, this
> - * path still is not, as krcp->lock is a raw spinlock.
> - * With additional page pre-allocation in the works,
> - * hitting this return is going to be much less likely.
> - */
> - if (IS_ENABLED(CONFIG_PREEMPT_RT))
> - return false;
> -
> - /*
> - * NOTE: For one argument of kvfree_rcu() we can
> - * drop the lock and get the page in sleepable
> - * context. That would allow to maintain an array
> - * for the CONFIG_PREEMPT_RT as well if no cached
> - * pages are available.
> - */
> - bnode = (struct kvfree_rcu_bulk_data *)
> - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> - }
> -
> /* Switch to emergency path. */
> - if (unlikely(!bnode))
> + if (!bnode)
> return false;
>
> /* Initialize the new block. */
> @@ -3452,12 +3487,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> goto unlock_return;
> }
>
> - /*
> - * Under high memory pressure GFP_NOWAIT can fail,
> - * in that case the emergency path is maintained.
> - */
> success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> if (!success) {
> + run_page_cache_worker(krcp);
> +
> if (head == NULL)
> // Inline if kvfree_rcu(one_arg) call.
> goto unlock_return;
> @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
>
> for_each_possible_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> - struct kvfree_rcu_bulk_data *bnode;
>
> for (i = 0; i < KFREE_N_BATCHES; i++) {
> INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> krcp->krw_arr[i].krcp = krcp;
> }
>
> - for (i = 0; i < rcu_min_cached_objs; i++) {
> - bnode = (struct kvfree_rcu_bulk_data *)
> - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> -
> - if (bnode)
> - put_cached_bnode(krcp, bnode);
> - else
> - pr_err("Failed to preallocate for %d CPU!\n", cpu);
> - }
> -
> INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> krcp->initialized = true;
> }
> if (register_shrinker(&kfree_rcu_shrinker))
> --
> 2.20.1
>

2020-11-03 16:36:24

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > The current memmory-allocation interface presents to following
> > difficulties that this patch is designed to overcome:
> >
> > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > complain about violation("BUG: Invalid wait context") of the
> > nesting rules. It does the raw_spinlock vs. spinlock nesting
> > checks, i.e. it is not legal to acquire a spinlock_t while
> > holding a raw_spinlock_t.
> >
> > Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > "page allocator" internally deals with spinlock_t to access
> > to its zones. The code also can be broken from higher level
> > of view:
> > <snip>
> > raw_spin_lock(&some_lock);
> > kfree_rcu(some_pointer, some_field_offset);
> > <snip>
> >
> > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > is converted into sleepable variant. Invoking the page allocator from
> > atomic contexts leads to "BUG: scheduling while atomic".
> >
> > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > and kvfree_rcu() are expected to be called from atomic raw context
> > as well.
> >
> > Move out a page allocation from contexts which trigger kvfree_rcu()
> > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > cache is empty a fallback mechanism is used and a special job is
> > scheduled to refill the per-cpu cache.
>
> Looks good, still reviewing here. BTW just for my education, I was wondering
> about Thomas's email:
> https://lkml.org/lkml/2020/8/11/939
>
> If slab allocations in pure raw-atomic context on RT is not allowed or
> recommended, should kfree_rcu() be allowed?
>
Thanks for reviewing, Joel :)

The decision was made that we need to support kfree_rcu() from "real atomic contexts",
to align with how it used to be before. We could go and just convert our local locks
to the spinlock_t variant, but that was not Paul's goal; it may be that some users need
kfree_rcu() from raw atomics.

>
> slab can have same issue right? If per-cpu cache is drained, it has to
> allocate page from buddy allocator and there's no GFP flag to tell it about
> context where alloc is happening from.
>
Sounds like it. Apart from that, it might turn out soon that we or somebody
else will raise the question one more time about something like GFP_RAW or GFP_NOLOCKS.
So who knows...

>
> Or are we saying that we want to support kfree on RT from raw atomic atomic
> context, even though kmalloc is not supported? I hate to bring up this
> elephant in the room, but since I am a part of the people maintaining this
> code, I believe I would rather set some rules than supporting unsupported
> usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> put a giant warning than supporting it :-(.
>
We discussed it several times; the conclusion was that we need to support
kfree_rcu() from raw contexts. At least that was a clear signal from Paul
to me. I think, once we obtain a preemptible() that is universally meaningful,
we can drop the patch in question later on.

--
Vlad Rezki

2020-11-03 17:57:45

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> The current memmory-allocation interface presents to following
> difficulties that this patch is designed to overcome
[...]
> ---
> kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> 1 file changed, 66 insertions(+), 43 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 06895ef85d69..f2da2a1cc716 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> * per-CPU. Object size is equal to one page. This value
> * can be changed at boot time.
> */
> -static int rcu_min_cached_objs = 2;
> +static int rcu_min_cached_objs = 5;
> module_param(rcu_min_cached_objs, int, 0444);
>
> /* Retrieve RCU kthreads priority for rcutorture */
> @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> * In order to save some per-cpu space the list is singular.
> * Even though it is lockless an access has to be protected by the
> * per-cpu lock.
> + * @page_cache_work: A work to refill the cache when it is empty
> + * @work_in_progress: Indicates that page_cache_work is running
> + * @hrtimer: A hrtimer for scheduling a page_cache_work
> * @nr_bkv_objs: number of allocated objects at @bkvcache.
> *
> * This is a per-CPU structure. The reason that it is not included in
> @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> bool monitor_todo;
> bool initialized;
> int count;
> +
> + struct work_struct page_cache_work;
> + atomic_t work_in_progress;

Does it need to be atomic? run_page_cache_work() is only called under a lock.
You can use xchg() there. And when you do the atomic_set, you can use
WRITE_ONCE as it is a data-race.
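As a rough illustration of that suggestion (a sketch only, assuming
work_in_progress were changed to a plain int; not a tested patch), the
quoted hunks could become something like:

<snip>
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
	/* xchg() is an atomic RMW even on a plain int, so atomic_t is not required. */
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
	    !xchg(&krcp->work_in_progress, 1)) {
		hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
			HRTIMER_MODE_REL);
		krcp->hrtimer.function = schedule_page_work_fn;
		hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
	}
}

/* ...and at the end of fill_page_cache_func(): */
	/* Plain store; WRITE_ONCE() documents the intentional data race with xchg(). */
	WRITE_ONCE(krcp->work_in_progress, 0);
<snip>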

> @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
>
> for_each_possible_cpu(cpu) {
> struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> - struct kvfree_rcu_bulk_data *bnode;
>
> for (i = 0; i < KFREE_N_BATCHES; i++) {
> INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> krcp->krw_arr[i].krcp = krcp;
> }
>
> - for (i = 0; i < rcu_min_cached_objs; i++) {
> - bnode = (struct kvfree_rcu_bulk_data *)
> - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> -
> - if (bnode)
> - put_cached_bnode(krcp, bnode);
> - else
> - pr_err("Failed to preallocate for %d CPU!\n", cpu);
> - }
> -
> INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> krcp->initialized = true;

During initialization, is it not better to still pre-allocate? That way you
don't have to wait to get into a situation where you need to initially
allocate.

AFAICS after the above line deletions, the bulk pages are initially empty.
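One possible, untested way to keep an initial fill without re-adding the
allocation loop (assuming workqueues are already usable at this point of
boot) would be to kick the refill worker once per CPU right after it is
set up, roughly:

<snip>
	INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
	INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
	krcp->initialized = true;

	/* Sketch: warm up the per-CPU cache; the work runs later in sleepable context. */
	queue_work(system_wq, &krcp->page_cache_work);
<snip>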

thanks,

- Joel


> }
> if (register_shrinker(&kfree_rcu_shrinker))
> --
> 2.20.1
>

2020-11-03 18:04:59

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible

Hi Vlad,
Few minor nits:

On Thu, Oct 29, 2020 at 05:50:18PM +0100, Uladzislau Rezki (Sony) wrote:
> Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
> earlier commits in this series, the preemptible() macro now properly
> detects preempt-disable code regions even in kernels built with
> CONFIG_PREEMPT_NONE.
>
> This commit therefore uses preemptible() to determine whether allocation
> is possible at all for double-argument kvfree_rcu(). If !preemptible(),
> then allocation is not possible, and kvfree_rcu() falls back to using
> the less cache-friendly rcu_head approach. Even when preemptible(),
> the caller might be involved in reclaim, so the GFP_ flags used by
> double-argument kvfree_rcu() must avoid invoking reclaim processing.
>
> Note that single-argument kvfree_rcu() must be invoked in sleepable
> contexts, and that its fallback is the relatively high latency
> synchronize_rcu(). Single-argument kvfree_rcu() therefore uses
> GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
> memory allocator.
>
> [ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
> Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> ---
> kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
> 1 file changed, 31 insertions(+), 17 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f2da2a1cc716..3f9b016a44dc 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> }
> }
>
> +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> +// state specified by flags. If can_sleep is true, the caller must
> +// be schedulable and not be holding any locks or mutexes that might be
> +// acquired by the memory allocator or anything that it might invoke.
> +// If !can_sleep, then if !preemptible() no allocation will be undertaken,
> +// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
> +// the aforementioned deadlock possibilities. Returns true if ptr was
> +// successfully recorded, else the caller must use a fallback.
> static inline bool
> -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
> + unsigned long *flags, void *ptr, bool can_sleep)
> {
> struct kvfree_rcu_bulk_data *bnode;
> + bool can_alloc_page = preemptible();
> + gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
> + GFP_ATOMIC) | __GFP_NOWARN;
> int idx;
>
> - if (unlikely(!krcp->initialized))
> + *krcp = krc_this_cpu_lock(flags);
> + if (unlikely(!(*krcp)->initialized))
> return false;
>
> - lockdep_assert_held(&krcp->lock);
> idx = !!is_vmalloc_addr(ptr);
>
> /* Check if a new block is required. */

Maybe convert this comment also to //... like the new ones you added (and the
ones below).

> - if (!krcp->bkvhead[idx] ||
> - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> - bnode = get_cached_bnode(krcp);
> - /* Switch to emergency path. */
> + if (!(*krcp)->bkvhead[idx] ||
> + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> + bnode = get_cached_bnode(*krcp);
> + if (!bnode && can_alloc_page) {

I think you can directly put preemptible() here with a comment saying
allocate only if preemptible and get rid of can_alloc_page.

> + krc_this_cpu_unlock(*krcp, *flags);
> + bnode = (struct kvfree_rcu_bulk_data *)
> + __get_free_page(gfp);
> + *krcp = krc_this_cpu_lock(flags);
> + }
> +

I think the "Switch to emergency path" comment should stay here before the
if().

thanks,

- Joel

> if (!bnode)
> return false;
>
> /* Initialize the new block. */
> bnode->nr_records = 0;
> - bnode->next = krcp->bkvhead[idx];
> + bnode->next = (*krcp)->bkvhead[idx];
>
> /* Attach it to the head. */
> - krcp->bkvhead[idx] = bnode;
> + (*krcp)->bkvhead[idx] = bnode;
> }
>
> /* Finally insert. */
> - krcp->bkvhead[idx]->records
> - [krcp->bkvhead[idx]->nr_records++] = ptr;
> + (*krcp)->bkvhead[idx]->records
> + [(*krcp)->bkvhead[idx]->nr_records++] = ptr;
>
> return true;
> }
> @@ -3474,20 +3492,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> ptr = (unsigned long *) func;
> }
>
> - krcp = krc_this_cpu_lock(&flags);
> -
> // Queue the object but don't yet schedule the batch.
> if (debug_rcu_head_queue(ptr)) {
> // Probable double kfree_rcu(), just leak.
> WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
> __func__, head);
>
> - // Mark as success and leave.
> - success = true;
> - goto unlock_return;
> + return;
> }
>
> - success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
> + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
> if (!success) {
> run_page_cache_worker(krcp);
>
> --
> 2.20.1
>

2020-11-03 19:20:17

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > The current memmory-allocation interface presents to following
> > > difficulties that this patch is designed to overcome:
> > >
> > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > > complain about violation("BUG: Invalid wait context") of the
> > > nesting rules. It does the raw_spinlock vs. spinlock nesting
> > > checks, i.e. it is not legal to acquire a spinlock_t while
> > > holding a raw_spinlock_t.
> > >
> > > Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > > "page allocator" internally deals with spinlock_t to access
> > > to its zones. The code also can be broken from higher level
> > > of view:
> > > <snip>
> > > raw_spin_lock(&some_lock);
> > > kfree_rcu(some_pointer, some_field_offset);
> > > <snip>
> > >
> > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > > is converted into sleepable variant. Invoking the page allocator from
> > > atomic contexts leads to "BUG: scheduling while atomic".
> > >
> > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > > and kvfree_rcu() are expected to be called from atomic raw context
> > > as well.
> > >
> > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > cache is empty a fallback mechanism is used and a special job is
> > > scheduled to refill the per-cpu cache.
> >
> > Looks good, still reviewing here. BTW just for my education, I was wondering
> > about Thomas's email:
> > https://lkml.org/lkml/2020/8/11/939
> >
> > If slab allocations in pure raw-atomic context on RT is not allowed or
> > recommended, should kfree_rcu() be allowed?
> >
> Thanks for reviewing, Joel :)
>
> The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> to align with how it used to be before. We can go and just convert our local locks
> to the spinlock_t variant but that was not Paul goal, it can be that some users need
> kfree_rcu() for raw atomics.

People invoke call_rcu() from raw atomics, and so we should provide
the same for kfree_rcu(). Yes, people could work around a raw-atomic
prohibition, but such prohibitions incur constant costs over time in
terms of development effort, increased bug rate, and increased complexity.
Yes, this does increase all of those for RCU, but the relative increase
is negligible, RCU being what it is.

> > slab can have same issue right? If per-cpu cache is drained, it has to
> > allocate page from buddy allocator and there's no GFP flag to tell it about
> > context where alloc is happening from.
> >
> Sounds like that. Apart of that, it might turn out soon that we or somebody
> else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> So who knows..

I would prefer that slab provide some way of dealing with raw atomic
context, but the maintainers are thus far unconvinced.

> > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > context, even though kmalloc is not supported? I hate to bring up this
> > elephant in the room, but since I am a part of the people maintaining this
> > code, I believe I would rather set some rules than supporting unsupported
> > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > put a giant warning than supporting it :-(.
> >
> We discussed it several times, the conclusion was that we need to support
> kfree_rcu() from raw contexts. At least that was a clear signal from Paul
> to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> can drop the patch that is in question later on in the future.

Given a universally meaningful preemptible(), we could directly call
the allocator in some cases. It might (or might not) still make sense
to defer the allocation when preemptible() indicated that a direct call
to the allocator was unsafe.
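For reference, a sketch of that decision, following the shape of the
patch 15/16 code quoted earlier in this thread (header checks elided and
GFP flag selection simplified):

<snip>
	bool can_alloc = preemptible();		/* Sampled before the raw lock is taken. */

	*krcp = krc_this_cpu_lock(flags);
	...
	bnode = get_cached_bnode(*krcp);
	if (!bnode && can_alloc) {
		/* Safe to enter the allocator: drop the raw per-CPU lock first. */
		krc_this_cpu_unlock(*krcp, *flags);
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
		*krcp = krc_this_cpu_lock(flags);
	}
	if (!bnode)
		return false;	/* Defer; the refill worker restocks the cache. */
<snip>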

Thanx, Paul

2020-11-04 11:42:11

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible

Hello, Joel.

>
> On Thu, Oct 29, 2020 at 05:50:18PM +0100, Uladzislau Rezki (Sony) wrote:
> > Given that CONFIG_PREEMPT_COUNT is unconditionally enabled by the
> > earlier commits in this series, the preemptible() macro now properly
> > detects preempt-disable code regions even in kernels built with
> > CONFIG_PREEMPT_NONE.
> >
> > This commit therefore uses preemptible() to determine whether allocation
> > is possible at all for double-argument kvfree_rcu(). If !preemptible(),
> > then allocation is not possible, and kvfree_rcu() falls back to using
> > the less cache-friendly rcu_head approach. Even when preemptible(),
> > the caller might be involved in reclaim, so the GFP_ flags used by
> > double-argument kvfree_rcu() must avoid invoking reclaim processing.
> >
> > Note that single-argument kvfree_rcu() must be invoked in sleepable
> > contexts, and that its fallback is the relatively high latency
> > synchronize_rcu(). Single-argument kvfree_rcu() therefore uses
> > GFP_KERNEL|__GFP_RETRY_MAYFAIL to allow limited sleeping within the
> > memory allocator.
> >
> > [ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ]
> > Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > ---
> > kernel/rcu/tree.c | 48 ++++++++++++++++++++++++++++++-----------------
> > 1 file changed, 31 insertions(+), 17 deletions(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index f2da2a1cc716..3f9b016a44dc 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3406,37 +3406,55 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
> > }
> > }
> >
> > +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
> > +// state specified by flags. If can_sleep is true, the caller must
> > +// be schedulable and not be holding any locks or mutexes that might be
> > +// acquired by the memory allocator or anything that it might invoke.
> > +// If !can_sleep, then if !preemptible() no allocation will be undertaken,
> > +// otherwise the allocation will use GFP_ATOMIC to avoid the remainder of
> > +// the aforementioned deadlock possibilities. Returns true if ptr was
> > +// successfully recorded, else the caller must use a fallback.
> > static inline bool
> > -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
> > +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
> > + unsigned long *flags, void *ptr, bool can_sleep)
> > {
> > struct kvfree_rcu_bulk_data *bnode;
> > + bool can_alloc_page = preemptible();
> > + gfp_t gfp = (can_sleep ? GFP_KERNEL | __GFP_RETRY_MAYFAIL :
> > + GFP_ATOMIC) | __GFP_NOWARN;
> > int idx;
> >
> > - if (unlikely(!krcp->initialized))
> > + *krcp = krc_this_cpu_lock(flags);
> > + if (unlikely(!(*krcp)->initialized))
> > return false;
> >
> > - lockdep_assert_held(&krcp->lock);
> > idx = !!is_vmalloc_addr(ptr);
> >
> > /* Check if a new block is required. */
>
> Maybe convert this comment also to //... like the new ones you added (and the
> ones below).
>
No problem. I can convert it.

> > - if (!krcp->bkvhead[idx] ||
> > - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > - bnode = get_cached_bnode(krcp);
> > - /* Switch to emergency path. */
> > + if (!(*krcp)->bkvhead[idx] ||
> > + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > + bnode = get_cached_bnode(*krcp);
> > + if (!bnode && can_alloc_page) {
>
> I think you can directly put preemptible() here with a comment saying
> allocate only if preemptible and get rid of can_alloc_page.
>
Not really. We check preemptible() before acquiring the internal lock,
otherwise it would always return "false". Thus, it is checked on entry,
at the beginning.
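That is, as a sketch (the point is where preemptible() is sampled, not the
exact code):

<snip>
	bool can_alloc_page = preemptible();	/* Must be sampled here... */

	*krcp = krc_this_cpu_lock(flags);	/* ...because this disables IRQs and takes a
						 * raw spinlock, so preemptible() would always
						 * report false from here on. */
<snip>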

> > + krc_this_cpu_unlock(*krcp, *flags);
> > + bnode = (struct kvfree_rcu_bulk_data *)
> > + __get_free_page(gfp);
> > + *krcp = krc_this_cpu_lock(flags);
> > + }
> > +
>
> I think the "Switch to emergency path" comment should stay here before the
> if().
>
Right. We need to keep it. Will take it back.

--
Vlad Rezki

2020-11-04 12:15:43

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Tue, Nov 03, 2020 at 12:54:22PM -0500, Joel Fernandes wrote:
> On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > The current memmory-allocation interface presents to following
> > difficulties that this patch is designed to overcome
> [...]
> > ---
> > kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> > 1 file changed, 66 insertions(+), 43 deletions(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 06895ef85d69..f2da2a1cc716 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> > * per-CPU. Object size is equal to one page. This value
> > * can be changed at boot time.
> > */
> > -static int rcu_min_cached_objs = 2;
> > +static int rcu_min_cached_objs = 5;
> > module_param(rcu_min_cached_objs, int, 0444);
> >
> > /* Retrieve RCU kthreads priority for rcutorture */
> > @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> > * In order to save some per-cpu space the list is singular.
> > * Even though it is lockless an access has to be protected by the
> > * per-cpu lock.
> > + * @page_cache_work: A work to refill the cache when it is empty
> > + * @work_in_progress: Indicates that page_cache_work is running
> > + * @hrtimer: A hrtimer for scheduling a page_cache_work
> > * @nr_bkv_objs: number of allocated objects at @bkvcache.
> > *
> > * This is a per-CPU structure. The reason that it is not included in
> > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> > bool monitor_todo;
> > bool initialized;
> > int count;
> > +
> > + struct work_struct page_cache_work;
> > + atomic_t work_in_progress;
>
> Does it need to be atomic? run_page_cache_work() is only called under a lock.
> You can use xchg() there. And when you do the atomic_set, you can use
> WRITE_ONCE as it is a data-race.
>
We can use xchg() together with the *_ONCE() macros. Could you please clarify what
your concern is about using atomic_t? Both xchg() and atomic_xchg() guarantee
atomicity, same as WRITE_ONCE() or atomic_set().

> > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> >
> > for_each_possible_cpu(cpu) {
> > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > - struct kvfree_rcu_bulk_data *bnode;
> >
> > for (i = 0; i < KFREE_N_BATCHES; i++) {
> > INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > krcp->krw_arr[i].krcp = krcp;
> > }
> >
> > - for (i = 0; i < rcu_min_cached_objs; i++) {
> > - bnode = (struct kvfree_rcu_bulk_data *)
> > - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > -
> > - if (bnode)
> > - put_cached_bnode(krcp, bnode);
> > - else
> > - pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > - }
> > -
> > INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> > krcp->initialized = true;
>
> During initialization, is it not better to still pre-allocate? That way you
> don't have to wait to get into a situation where you need to initially
> allocate.
>
Since we have a worker that does it when the cache is empty, there is no
great need to do it during the initialization phase. If we can reduce
the amount of code, that is always good :)

Thanks, Joel.

--
Vlad Rezki

2020-11-04 12:37:52

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > The current memmory-allocation interface presents to following
> > > > difficulties that this patch is designed to overcome:
> > > >
> > > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > > > complain about violation("BUG: Invalid wait context") of the
> > > > nesting rules. It does the raw_spinlock vs. spinlock nesting
> > > > checks, i.e. it is not legal to acquire a spinlock_t while
> > > > holding a raw_spinlock_t.
> > > >
> > > > Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > > > "page allocator" internally deals with spinlock_t to access
> > > > to its zones. The code also can be broken from higher level
> > > > of view:
> > > > <snip>
> > > > raw_spin_lock(&some_lock);
> > > > kfree_rcu(some_pointer, some_field_offset);
> > > > <snip>
> > > >
> > > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > > > is converted into sleepable variant. Invoking the page allocator from
> > > > atomic contexts leads to "BUG: scheduling while atomic".
> > > >
> > > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > > > and kvfree_rcu() are expected to be called from atomic raw context
> > > > as well.
> > > >
> > > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > > cache is empty a fallback mechanism is used and a special job is
> > > > scheduled to refill the per-cpu cache.
> > >
> > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > about Thomas's email:
> > > https://lkml.org/lkml/2020/8/11/939
> > >
> > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > recommended, should kfree_rcu() be allowed?
> > >
> > Thanks for reviewing, Joel :)
> >
> > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > to align with how it used to be before. We can go and just convert our local locks
> > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > kfree_rcu() for raw atomics.
>
> People invoke call_rcu() from raw atomics, and so we should provide
> the same for kfree_rcu(). Yes, people could work around a raw-atomic
> prohibition, but such prohibitions incur constant costs over time in
> terms of development effort, increased bug rate, and increased complexity.
> Yes, this does increase all of those for RCU, but the relative increase
> is negligible, RCU being what it is.
>
I see your point.

> > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > context where alloc is happening from.
> > >
> > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > So who knows..
>
> I would prefer that slab provide some way of dealing with raw atomic
> context, but the maintainers are thus far unconvinced.
>
I think, when preempt_rt is fully integrated into the kernel, we might get
new users with such a demand. So it is not a closed topic so far, IMHO.

> > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > context, even though kmalloc is not supported? I hate to bring up this
> > > elephant in the room, but since I am a part of the people maintaining this
> > > code, I believe I would rather set some rules than supporting unsupported
> > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > put a giant warning than supporting it :-(.
> > >
> > We discussed it several times, the conclusion was that we need to support
> > kfree_rcu() from raw contexts. At least that was a clear signal from Paul
> > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > can drop the patch that is in question later on in the future.
>
> Given a universally meaningful preemptible(), we could directly call
> the allocator in some cases. It might (or might not) still make sense
> to defer the allocation when preemptible() indicated that a direct call
> to the allocator was unsafe.
>
I do not have a strong opinion here. Given that maintaining such
"deferring" is not considered a big effort, I think we can live
with it.

--
Vlad Rezki

2020-11-04 14:13:46

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Wed, Nov 04, 2020 at 01:35:53PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > The current memmory-allocation interface presents to following
> > > > > difficulties that this patch is designed to overcome:
> > > > >
> > > > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > > > > complain about violation("BUG: Invalid wait context") of the
> > > > > nesting rules. It does the raw_spinlock vs. spinlock nesting
> > > > > checks, i.e. it is not legal to acquire a spinlock_t while
> > > > > holding a raw_spinlock_t.
> > > > >
> > > > > Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > > > > "page allocator" internally deals with spinlock_t to access
> > > > > to its zones. The code also can be broken from higher level
> > > > > of view:
> > > > > <snip>
> > > > > raw_spin_lock(&some_lock);
> > > > > kfree_rcu(some_pointer, some_field_offset);
> > > > > <snip>
> > > > >
> > > > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > > > > is converted into sleepable variant. Invoking the page allocator from
> > > > > atomic contexts leads to "BUG: scheduling while atomic".
> > > > >
> > > > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > > > > and kvfree_rcu() are expected to be called from atomic raw context
> > > > > as well.
> > > > >
> > > > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > > > cache is empty a fallback mechanism is used and a special job is
> > > > > scheduled to refill the per-cpu cache.
> > > >
> > > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > > about Thomas's email:
> > > > https://lkml.org/lkml/2020/8/11/939
> > > >
> > > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > > recommended, should kfree_rcu() be allowed?
> > > >
> > > Thanks for reviewing, Joel :)
> > >
> > > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > > to align with how it used to be before. We can go and just convert our local locks
> > > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > > kfree_rcu() for raw atomics.
> >
> > People invoke call_rcu() from raw atomics, and so we should provide
> > the same for kfree_rcu(). Yes, people could work around a raw-atomic
> > prohibition, but such prohibitions incur constant costs over time in
> > terms of development effort, increased bug rate, and increased complexity.
> > Yes, this does increase all of those for RCU, but the relative increase
> > is negligible, RCU being what it is.
> >
> I see your point.
>
> > > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > > context where alloc is happening from.
> > > >
> > > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > > So who knows..
> >
> > I would prefer that slab provide some way of dealing with raw atomic
> > context, but the maintainers are thus far unconvinced.
> >
> I think, when preempt_rt is fully integrated to the kernel, we might get
> new users with such demand. So, it is not a closed topic so far, IMHO.

Agreed! ;-)

> > > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > > context, even though kmalloc is not supported? I hate to bring up this
> > > > elephant in the room, but since I am a part of the people maintaining this
> > > > code, I believe I would rather set some rules than supporting unsupported
> > > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > > put a giant warning than supporting it :-(.
> > > >
> > > We discussed it several times, the conclusion was that we need to support
> > > kfree_rcu() from raw contexts. At least that was a clear signal from Paul
> > > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > > can drop the patch that is in question later on in the future.
> >
> > Given a universally meaningful preemptible(), we could directly call
> > the allocator in some cases. It might (or might not) still make sense
> > to defer the allocation when preemptible() indicated that a direct call
> > to the allocator was unsafe.
> >
> I do not have a strong opinion here. Giving the fact that maintaining of
> such "deferring" is not considered as a big effort, i think, we can live
> with it.

And agreed here as well. If this were instead a large body of complex
code, I might feel otherwise. But as it is, why worry?

Thanx, Paul

2020-11-04 14:43:29

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Wed, Nov 04, 2020 at 06:12:00AM -0800, Paul E. McKenney wrote:
> On Wed, Nov 04, 2020 at 01:35:53PM +0100, Uladzislau Rezki wrote:
> > On Tue, Nov 03, 2020 at 11:18:22AM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 03, 2020 at 05:33:50PM +0100, Uladzislau Rezki wrote:
> > > > On Tue, Nov 03, 2020 at 10:47:23AM -0500, Joel Fernandes wrote:
> > > > > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > > > > The current memmory-allocation interface presents to following
> > > > > > difficulties that this patch is designed to overcome:
> > > > > >
> > > > > > a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
> > > > > > complain about violation("BUG: Invalid wait context") of the
> > > > > > nesting rules. It does the raw_spinlock vs. spinlock nesting
> > > > > > checks, i.e. it is not legal to acquire a spinlock_t while
> > > > > > holding a raw_spinlock_t.
> > > > > >
> > > > > > Internally the kfree_rcu() uses raw_spinlock_t whereas the
> > > > > > "page allocator" internally deals with spinlock_t to access
> > > > > > to its zones. The code also can be broken from higher level
> > > > > > of view:
> > > > > > <snip>
> > > > > > raw_spin_lock(&some_lock);
> > > > > > kfree_rcu(some_pointer, some_field_offset);
> > > > > > <snip>
> > > > > >
> > > > > > b) If built with CONFIG_PREEMPT_RT. Please note, in that case spinlock_t
> > > > > > is converted into sleepable variant. Invoking the page allocator from
> > > > > > atomic contexts leads to "BUG: scheduling while atomic".
> > > > > >
> > > > > > c) call_rcu() is invoked from raw atomic context and kfree_rcu()
> > > > > > and kvfree_rcu() are expected to be called from atomic raw context
> > > > > > as well.
> > > > > >
> > > > > > Move out a page allocation from contexts which trigger kvfree_rcu()
> > > > > > function to the separate worker. When a k[v]free_rcu() per-cpu page
> > > > > > cache is empty a fallback mechanism is used and a special job is
> > > > > > scheduled to refill the per-cpu cache.
> > > > >
> > > > > Looks good, still reviewing here. BTW just for my education, I was wondering
> > > > > about Thomas's email:
> > > > > https://lkml.org/lkml/2020/8/11/939
> > > > >
> > > > > If slab allocations in pure raw-atomic context on RT is not allowed or
> > > > > recommended, should kfree_rcu() be allowed?
> > > > >
> > > > Thanks for reviewing, Joel :)
> > > >
> > > > The decision was made that we need to support kfree_rcu() from "real atomic contexts",
> > > > to align with how it used to be before. We can go and just convert our local locks
> > > > to the spinlock_t variant but that was not Paul goal, it can be that some users need
> > > > kfree_rcu() for raw atomics.
> > >
> > > People invoke call_rcu() from raw atomics, and so we should provide
> > > the same for kfree_rcu(). Yes, people could work around a raw-atomic
> > > prohibition, but such prohibitions incur constant costs over time in
> > > terms of development effort, increased bug rate, and increased complexity.
> > > Yes, this does increase all of those for RCU, but the relative increase
> > > is negligible, RCU being what it is.
> > >
> > I see your point.
> >
> > > > > slab can have same issue right? If per-cpu cache is drained, it has to
> > > > > allocate page from buddy allocator and there's no GFP flag to tell it about
> > > > > context where alloc is happening from.
> > > > >
> > > > Sounds like that. Apart of that, it might turn out soon that we or somebody
> > > > else will rise a question one more time about something GFP_RAW or GFP_NOLOCKS.
> > > > So who knows..
> > >
> > > I would prefer that slab provide some way of dealing with raw atomic
> > > context, but the maintainers are thus far unconvinced.
> > >
> > I think, when preempt_rt is fully integrated to the kernel, we might get
> > new users with such demand. So, it is not a closed topic so far, IMHO.
>
> Agreed! ;-)
>
> > > > > Or are we saying that we want to support kfree on RT from raw atomic atomic
> > > > > context, even though kmalloc is not supported? I hate to bring up this
> > > > > elephant in the room, but since I am a part of the people maintaining this
> > > > > code, I believe I would rather set some rules than supporting unsupported
> > > > > usages. :-\ (Once I know what is supported and what isn't that is). If indeed
> > > > > raw atomic kfree_rcu() is a bogus use case because of -RT, then we ought to
> > > > > put a giant warning than supporting it :-(.
> > > > >
> > > > We discussed it several times, the conclusion was that we need to support
> > > > kfree_rcu() from raw contexts. At least that was a clear signal from Paul
> > > > to me. I think, if we obtain the preemtable(), so it becomes versatile, we
> > > > can drop the patch that is in question later on in the future.
> > >
> > > Given a universally meaningful preemptible(), we could directly call
> > > the allocator in some cases. It might (or might not) still make sense
> > > to defer the allocation when preemptible() indicated that a direct call
> > > to the allocator was unsafe.
> > >
> > I do not have a strong opinion here. Giving the fact that maintaining of
> > such "deferring" is not considered as a big effort, i think, we can live
> > with it.
>
> And agreed here as well. If this were instead a large body of complex
> code, I might feel otherwise. But as it is, why worry?
>
Agreed! I do not consider it extra complexity.

--
Vlad Rezki

2020-11-04 15:05:52

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

On Wed, Nov 04, 2020 at 01:12:03PM +0100, Uladzislau Rezki wrote:
> On Tue, Nov 03, 2020 at 12:54:22PM -0500, Joel Fernandes wrote:
> > On Thu, Oct 29, 2020 at 05:50:04PM +0100, Uladzislau Rezki (Sony) wrote:
> > > The current memmory-allocation interface presents to following
> > > difficulties that this patch is designed to overcome
> > [...]
> > > ---
> > > kernel/rcu/tree.c | 109 ++++++++++++++++++++++++++++------------------
> > > 1 file changed, 66 insertions(+), 43 deletions(-)
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 06895ef85d69..f2da2a1cc716 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
> > > * per-CPU. Object size is equal to one page. This value
> > > * can be changed at boot time.
> > > */
> > > -static int rcu_min_cached_objs = 2;
> > > +static int rcu_min_cached_objs = 5;
> > > module_param(rcu_min_cached_objs, int, 0444);
> > >
> > > /* Retrieve RCU kthreads priority for rcutorture */
> > > @@ -3084,6 +3084,9 @@ struct kfree_rcu_cpu_work {
> > > * In order to save some per-cpu space the list is singular.
> > > * Even though it is lockless an access has to be protected by the
> > > * per-cpu lock.
> > > + * @page_cache_work: A work to refill the cache when it is empty
> > > + * @work_in_progress: Indicates that page_cache_work is running
> > > + * @hrtimer: A hrtimer for scheduling a page_cache_work
> > > * @nr_bkv_objs: number of allocated objects at @bkvcache.
> > > *
> > > * This is a per-CPU structure. The reason that it is not included in
> > > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> > > bool monitor_todo;
> > > bool initialized;
> > > int count;
> > > +
> > > + struct work_struct page_cache_work;
> > > + atomic_t work_in_progress;
> >
> > Does it need to be atomic? run_page_cache_work() is only called under a lock.
> > You can use xchg() there. And when you do the atomic_set, you can use
> > WRITE_ONCE as it is a data-race.
> >
> We can use xchg together with *_ONCE() macro. Could you please clarify what
> is your concern about using atomic_t? Both xchg() and atomic_xchg() guarantee
> atamarity. Same as WRITE_ONCE() or atomic_set().

Right, whether there's a lock or not does not matter, as xchg() is also an
atomic swap.

atomic_t is a more complex type, though; I would directly use int since
atomic_t is not needed here and there's no lost-update issue. It could
be a matter of style as well.

BTW, I did think atomic_xchg() adds additional memory barriers,
but I could not find that to be the case in the implementation. Is that not
the case? The docs say "atomic_xchg must provide explicit memory barriers around
the operation.".

> > > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> > >
> > > for_each_possible_cpu(cpu) {
> > > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > - struct kvfree_rcu_bulk_data *bnode;
> > >
> > > for (i = 0; i < KFREE_N_BATCHES; i++) {
> > > INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > > krcp->krw_arr[i].krcp = krcp;
> > > }
> > >
> > > - for (i = 0; i < rcu_min_cached_objs; i++) {
> > > - bnode = (struct kvfree_rcu_bulk_data *)
> > > - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > -
> > > - if (bnode)
> > > - put_cached_bnode(krcp, bnode);
> > > - else
> > > - pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > > - }
> > > -
> > > INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > > + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> > > krcp->initialized = true;
> >
> > During initialization, is it not better to still pre-allocate? That way you
> > don't have to wait to get into a situation where you need to initially
> > allocate.
> >
> Since we have a worker that does it when the cache is empty, there is no
> great need to do it during the initialization phase. If we can reduce the
> amount of code, that is always good :)

I am all for not having more code than needed. But you would hit the
synchronize_rcu() slow path immediately on the first headless kfree_rcu(),
right? That seems like a step back from the current code :)
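
To make the concern concrete, here is a heavily simplified sketch (not the
actual tree.c code) of what the first headless call on a CPU would do if the
boot-time preallocation is dropped: the cache is empty, the refill worker is
kicked for later callers, and this one call falls back to a synchronous grace
period. The _sketch function names are illustrative; get_cached_bnode(),
run_page_cache_work(), krc_this_cpu_lock()/unlock() and the headless
synchronize_rcu() fallback are the pieces referred to in the thread.

<snip>
/* Sketch only: behavior of the very first headless call on an empty cache. */
static bool try_use_cached_page_sketch(void)
{
        struct kfree_rcu_cpu *krcp;
        struct kvfree_rcu_bulk_data *bnode;
        unsigned long flags;

        krcp = krc_this_cpu_lock(&flags);
        bnode = get_cached_bnode(krcp);         /* empty right after boot */
        if (!bnode)
                run_page_cache_work(krcp);      /* refill for later callers */
        /* ... otherwise attach the pointer to bnode as usual ... */
        krc_this_cpu_unlock(krcp, flags);

        return bnode != NULL;
}

static void headless_fallback_sketch(void *ptr)
{
        if (!try_use_cached_page_sketch()) {
                synchronize_rcu();      /* the slow-path hit discussed above */
                kvfree(ptr);
        }
}
<snip>

Once the worker has refilled the per-CPU cache, subsequent callers take the
normal batched path again.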

thanks,

- Joel

2020-11-04 17:30:22

by Joel Fernandes

[permalink] [raw]
Subject: Re: [PATCH 15/16] rcu/tree: Allocate a page when caller is preemptible

On Wed, Nov 04, 2020 at 12:39:31PM +0100, Uladzislau Rezki wrote:
[..]
> > > - if (!krcp->bkvhead[idx] ||
> > > - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > > - bnode = get_cached_bnode(krcp);
> > > - /* Switch to emergency path. */
> > > + if (!(*krcp)->bkvhead[idx] ||
> > > + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
> > > + bnode = get_cached_bnode(*krcp);
> > > + if (!bnode && can_alloc_page) {
> >
> > I think you can directly put preemptible() here with a comment saying
> > allocate only if preemptible and get rid of can_alloc_page.
> >
> Not really. We check preemptible() before acquiring the internal lock,
> otherwise it would always return "false". Thus, it is checked on
> entry, at the very beginning.
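
For context on why the ordering matters: with CONFIG_PREEMPTION, preemptible()
is (preempt_count() == 0 && !irqs_disabled()), and krc_this_cpu_lock() disables
interrupts, so sampling it after taking the lock would always yield false.
Below is a minimal sketch of the pattern being described; can_alloc_page,
get_cached_bnode() and krc_this_cpu_lock()/unlock() come from the patch, while
the function name and the exact GFP flags are illustrative.

<snip>
/* Sketch only: sample preemptible() on entry, before the lock disables irqs. */
static struct kvfree_rcu_bulk_data *
get_or_alloc_bnode_sketch(struct kfree_rcu_cpu **krcp, unsigned long *flags)
{
        bool can_alloc_page = preemptible();    /* must be read before locking */
        struct kvfree_rcu_bulk_data *bnode;

        *krcp = krc_this_cpu_lock(flags);       /* irqs are now off */

        bnode = get_cached_bnode(*krcp);
        if (!bnode && can_alloc_page) {
                /* Drop the raw lock before calling into the page allocator. */
                krc_this_cpu_unlock(*krcp, *flags);
                bnode = (struct kvfree_rcu_bulk_data *)
                        __get_free_page(GFP_KERNEL | __GFP_NOWARN);
                *krcp = krc_this_cpu_lock(flags);
        }

        return bnode;   /* caller still holds the re-acquired per-CPU lock */
}
<snip>

Because the lock is dropped around the allocation, the task may migrate and end
up on a different CPU's krcp, which is presumably why the quoted diff works
through a double pointer and dereferences (*krcp).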

You are right. Sorry. Sounds good to me.

- Joel

2020-11-04 19:17:58

by Uladzislau Rezki

[permalink] [raw]
Subject: Re: [PATCH 01/16] rcu/tree: Add a work to allocate pages from regular context

> > > > * This is a per-CPU structure. The reason that it is not included in
> > > > @@ -3100,6 +3103,11 @@ struct kfree_rcu_cpu {
> > > > bool monitor_todo;
> > > > bool initialized;
> > > > int count;
> > > > +
> > > > + struct work_struct page_cache_work;
> > > > + atomic_t work_in_progress;
> > >
> > > Does it need to be atomic? run_page_cache_work() is only called under a lock.
> > > You can use xchg() there. And when you do the atomic_set, you can use
> > > WRITE_ONCE as it is a data-race.
> > >
> > We can use xchg() together with the *_ONCE() macros. Could you please clarify
> > your concern about using atomic_t? Both xchg() and atomic_xchg() guarantee
> > atomicity, same as WRITE_ONCE() or atomic_set().
>
> Right, whether there is a lock or not does not matter, as xchg() is also an
> atomic swap.
>
> atomic_t is a more complex type though; I would directly use an int, since
> atomic_t is not needed here and there is no lost-update issue. It could be a
> matter of style as well.
>
> BTW, I thought atomic_xchg() adds additional memory barriers, but I could not
> find that to be the case in the implementation. Is that not the case? The docs
> say "atomic_xchg must provide explicit memory barriers around the operation.".
>
On most systems atomic_xchg() is the same as xchg() and atomic_set()
is the same as WRITE_ONCE(). But there are exceptions, for example "parisc":

*** arch/parisc/include/asm/atomic.h:
<snip>
...
#define _atomic_spin_lock_irqsave(l,f) do {     \
        arch_spinlock_t *s = ATOMIC_HASH(l);    \
        local_irq_save(f);                      \
        arch_spin_lock(s);                      \
} while(0)
...
static __inline__ void atomic_set(atomic_t *v, int i)
{
        unsigned long flags;
        _atomic_spin_lock_irqsave(v, flags);

        v->counter = i;

        _atomic_spin_unlock_irqrestore(v, flags);
}
<snip>

I will switch to xchg() and WRITE_ONCE() because of such arch-specific implementations.
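
A minimal sketch of what that switch might look like, assuming the
@work_in_progress field is changed from atomic_t to a plain int as discussed;
the function names mirror those mentioned in the thread, but the bodies here
are illustrative rather than the actual patch.

<snip>
/* Sketch only: claim the flag with xchg(), clear it with WRITE_ONCE(). */
static void run_page_cache_work_sketch(struct kfree_rcu_cpu *krcp)
{
        /* xchg() is an atomic swap, so only the first caller arms the timer. */
        if (!xchg(&krcp->work_in_progress, 1)) {
                hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                krcp->hrtimer.function = schedule_page_work_fn;
                hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
        }
}

static void fill_page_cache_func_sketch(struct work_struct *work)
{
        struct kfree_rcu_cpu *krcp =
                container_of(work, struct kfree_rcu_cpu, page_cache_work);

        /* ... refill krcp->bkvcache up to rcu_min_cached_objs pages ... */

        /* Clear the flag with a plain annotated store, as suggested above. */
        WRITE_ONCE(krcp->work_in_progress, 0);
}
<snip>

The main win is on the clearing side: WRITE_ONCE() is a plain store, whereas
the parisc atomic_set() above has to take the hashed spinlock.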

> > > > @@ -4449,24 +4482,14 @@ static void __init kfree_rcu_batch_init(void)
> > > >
> > > > for_each_possible_cpu(cpu) {
> > > > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> > > > - struct kvfree_rcu_bulk_data *bnode;
> > > >
> > > > for (i = 0; i < KFREE_N_BATCHES; i++) {
> > > > INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
> > > > krcp->krw_arr[i].krcp = krcp;
> > > > }
> > > >
> > > > - for (i = 0; i < rcu_min_cached_objs; i++) {
> > > > - bnode = (struct kvfree_rcu_bulk_data *)
> > > > - __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
> > > > -
> > > > - if (bnode)
> > > > - put_cached_bnode(krcp, bnode);
> > > > - else
> > > > - pr_err("Failed to preallocate for %d CPU!\n", cpu);
> > > > - }
> > > > -
> > > > INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> > > > + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> > > > krcp->initialized = true;
> > >
> > > During initialization, is it not better to still pre-allocate? That way you
> > > don't have to wait to get into a situation where you need to initially
> > > allocate.
> > >
> > Since we have a worker that does it when the cache is empty, there is no
> > great need to do it during the initialization phase. If we can reduce the
> > amount of code, that is always good :)
>
> I am all for not having more code than needed. But you would hit the
> synchronize_rcu() slow path immediately on the first headless kfree_rcu(),
> right? That seems like a step back from the current code :)
>
As for the slow path and hitting synchronize_rcu() immediately: yes, the
slow-hit "counter" will be increased by 1, so the difference between the two
variants is N versus N + 1 times. I do not consider N + 1 a big difference or
a real impact on performance.

Should we guarantee that the first user does not hit the fallback path that
invokes synchronize_rcu()? If not, I would rather remove the redundant code.

Any thoughts here?

Thanks!

--
Vlad Rezki