syzbot is reporting that the same local lock is held when trying to hold
mmap sem from both an IRQ-enabled context and IRQ context.
Since all callers use bpf_mmap_unlock_get_irq_work() before calling
mmap_read_trylock(), test in_hardirq() in bpf_mmap_unlock_get_irq_work()
in order to make sure that mmap_read_trylock() won't be called from IRQ
context.
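For reference, every caller follows roughly this pattern (a sketch loosely
based on stack_map_get_build_id_offset() in kernel/bpf/stackmap.c, not an
exact quote):

	struct mmap_unlock_irq_work *work = NULL;
	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

	/*
	 * If the per-CPU irq_work is busy (or, with this patch, if we are
	 * running in IRQ context), skip mmap_read_trylock() and take the
	 * fallback path instead.
	 */
	if (!current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) {
		/* ... fallback without touching mmap sem ... */
	} else {
		/* ... walk VMAs under mmap sem ... */
		bpf_mmap_unlock_mm(work, current->mm);
	}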
asm_exc_page_fault() => exc_page_fault() => handle_page_fault()
=> do_user_addr_fault() => handle_mm_fault() => __handle_mm_fault()
=> handle_pte_fault() => do_pte_missing() => do_anonymous_page()
=> vmf_anon_prepare() => mmap_read_trylock()
=> __mmap_lock_trace_start_locking()
=> __mmap_lock_do_trace_start_locking() => local_lock_acquire()
=> lock_acquire()
sysvec_irq_work() => instr_sysvec_irq_work() => __sysvec_irq_work()
=> irq_work_run() => irq_work_run_list() => irq_work_single()
=> do_bpf_send_signal() => group_send_sig_info() => rcu_read_unlock()
=> rcu_lock_release() => lock_release() => trace_lock_release()
=> perf_trace_lock() => perf_trace_run_bpf_submit() => trace_call_bpf()
=> bpf_prog_run_array() => bpf_prog_run() => __bpf_prog_run()
=> bpf_dispatcher_nop_func() => bpf_prog_e6cf5f9c69743609()
=> __bpf_get_stack() => stack_map_get_build_id_offset()
=> mmap_read_trylock() => __mmap_lock_trace_acquire_returned()
=> __mmap_lock_do_trace_acquire_returned() => local_lock_acquire()
=> lock_acquire()
WARNING: inconsistent lock state
6.10.0-rc2-syzkaller-00222-gd30d0e49da71 #0 Not tainted
--------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
syz-executor.2/10910 [HC1[1]:SC0[0]:HE0:SE1] takes:
ffff8880b9538828 (lock#10){?.+.}-{2:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
ffff8880b9538828 (lock#10){?.+.}-{2:2}, at: __mmap_lock_do_trace_acquire_returned+0x8f/0x630 mm/mmap_lock.c:237
other info that might help us debug this:
Possible unsafe locking scenario:
       CPU0
       ----
  lock(lock#10);
  <Interrupt>
    lock(lock#10);
*** DEADLOCK ***
Reported-by: syzbot <[email protected]>
Closes: https://syzkaller.appspot.com/bug?extid=a225ee3df7e7f9372dbe
Signed-off-by: Tetsuo Handa <[email protected]>
---
An example is https://syzkaller.appspot.com/text?tag=CrashReport&x=1649a2e2980000 ,
but I am not quoting it here because the link will disappear eventually.
kernel/bpf/mmap_unlock_work.h | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
index 5d18d7d85bef..337eb314d918 100644
--- a/kernel/bpf/mmap_unlock_work.h
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -26,7 +26,13 @@ static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **wo
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = false;
- if (irqs_disabled()) {
+ if (in_hardirq()) {
+ /*
+ * IRQ context does not allow trylocking mmap sem.
+ * Force the fallback code.
+ */
+ irq_work_busy = true;
+ } else if (irqs_disabled()) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
work = this_cpu_ptr(&mmap_unlock_work);
if (irq_work_is_busy(&work->irq_work)) {
--
2.34.1
On 2024/06/08 19:53, Tetsuo Handa wrote:
> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
Then, do we want to
- if (in_hardirq()) {
+ if (!in_task()) {
instead?
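For reference, include/linux/preempt.h defines these helpers roughly as
follows (paraphrased from a v6.10-era tree, so worth double-checking), which
is why !in_task() would also catch softirq and NMI contexts:

#define in_nmi()		(nmi_count())
#define in_hardirq()		(hardirq_count())
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
#define in_task()		(!(in_nmi() | in_hardirq() | in_serving_softirq()))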
On 2024/06/08 20:04, Tetsuo Handa wrote:
> On 2024/06/08 19:53, Tetsuo Handa wrote:
>> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
>
> Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
> found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
>
> Then, do we want to
>
> - if (in_hardirq()) {
> + if (!in_task()) {
>
> instead?
>
"inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
<[email protected]> wrote:
>
> On 2024/06/08 20:04, Tetsuo Handa wrote:
> > On 2024/06/08 19:53, Tetsuo Handa wrote:
> >> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
> >
> > Oops, "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage." example was
> > found at https://syzkaller.appspot.com/text?tag=CrashReport&x=14f0179a980000 .
> >
> > Then, do we want to
> >
> > - if (in_hardirq()) {
> > + if (!in_task()) {
> >
> > instead?
> >
>
> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
imo the issue is elsewhere. syzbot reports:
local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
__mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
__mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]
it complains about:
local_lock(&memcg_paths.lock);
in TRACE_MMAP_LOCK_EVENT.
which looks like a false positive.
Hello.
On 2024/06/15 1:40, Alexei Starovoitov wrote:
> On Fri, Jun 14, 2024 at 8:15 AM Tetsuo Handa
> <[email protected]> wrote:
>> "inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage." upon unlock from IRQ work
>> was reported at https://syzkaller.appspot.com/bug?extid=40905bca570ae6784745 .
>
> imo the issue is elsewhere. syzbot reports:
> local_lock_acquire include/linux/local_lock_internal.h:29 [inline]
> __mmap_lock_do_trace_released+0x9c/0x620 mm/mmap_lock.c:243
> __mmap_lock_trace_released include/linux/mmap_lock.h:42 [inline]
>
> it complains about:
> local_lock(&memcg_paths.lock);
> in TRACE_MMAP_LOCK_EVENT.
> which looks like a false positive.
Commit 2b5067a8143e ("mm: mmap_lock: add tracepoints around lock
acquisition") introduced the TRACE_MMAP_LOCK_EVENT() macro as shown below.
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
preempt_disable(); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
preempt_enable(); \
} while (0)
Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") moved the location of this macro.
Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling
preemption") replaced preempt_disable() with local_lock(&memcg_paths.lock)
based on an argument that preempt_disable() has to be avoided because
get_mm_memcg_path() might sleep if PREEMPT_RT=y.
The local_lock() macro is defined as
#define local_lock(lock) __local_lock(lock)
in include/linux/local_lock.h, and the __local_lock() macro is defined as
#define __local_lock(lock) \
do { \
preempt_disable(); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
if PREEMPT_RT=n or
#define __local_lock(__lock) \
do { \
migrate_disable(); \
spin_lock(this_cpu_ptr((__lock))); \
} while (0)
if PREEMPT_RT=y in include/linux/local_lock_internal.h .
Note the difference between preempt_disable() and migrate_disable():
preempt_disable() forbids sleeping, whereas migrate_disable() only pins the
task to the current CPU and still allows it to sleep (which is why it is
used for PREEMPT_RT=y).
Commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING
configurations") erroneously replaced local_lock() with preempt_disable(),
and commit e904c2ccf9b5 ("mm: mmap_lock: fix disabling preemption
directly") replaced preempt_disable() back with local_lock().
Now, syzbot started reporting
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
and
inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
messages, because local_lock() does not disable IRQs.
I think we can replace local_lock() with local_lock_irqsave(), like this:
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1901ad22ccbd 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -156,14 +156,15 @@ static inline void put_memcg_path_buf(void)
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
+ unsigned long flags; \
+ local_lock_irqsave(&memcg_paths.lock, flags); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
+ local_unlock_irqrestore(&memcg_paths.lock, flags); \
} while (0)
#else /* !CONFIG_MEMCG */
because local_lock_irqsave() is defined as
#define local_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)
in include/linux/local_lock.h, and the __local_lock_irqsave() macro is defined as
#define __local_lock_irqsave(lock, flags) \
do { \
local_irq_save(flags); \
local_lock_acquire(this_cpu_ptr(lock)); \
} while (0)
if PREEMPT_RT=n or
#define __local_lock_irqsave(lock, flags) \
do { \
typecheck(unsigned long, flags); \
flags = 0; \
__local_lock(lock); \
} while (0)
if PREEMPT_RT=y in include/linux/local_lock_internal.h .
But a fundamental question arises here: why do we need to hold
memcg_paths.lock at all? As of commit 44ef20baed8e ("Merge tag
's390-6.10-4' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux"),
"git grep -nF memcg_paths" indicates that memcg_paths.lock is used only
within the TRACE_MMAP_LOCK_EVENT() macro.
mm/mmap_lock.c:48:static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
mm/mmap_lock.c:63: memcg_path = per_cpu_ptr(&memcg_paths, cpu);
mm/mmap_lock.c:101: rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
mm/mmap_lock.c:135: struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
mm/mmap_lock.c:152: local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
mm/mmap_lock.c:159: local_lock(&memcg_paths.lock); \
mm/mmap_lock.c:166: local_unlock(&memcg_paths.lock); \
That is, what is the reason we can't revert commit 832b50725373 and
make the TRACE_MMAP_LOCK_EVENT() macro branch directly, as below?
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..1e440c7d48e6 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,19 +153,35 @@ static inline void put_memcg_path_buf(void)
rcu_read_unlock();
}
+#ifndef CONFIG_PREEMPT_RT
#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
do { \
const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
+ preempt_disable(); \
memcg_path = get_mm_memcg_path(mm); \
trace_mmap_lock_##type(mm, \
memcg_path != NULL ? memcg_path : "", \
##__VA_ARGS__); \
if (likely(memcg_path != NULL)) \
put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
+ preempt_enable(); \
} while (0)
+#else
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ migrate_disable(); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ migrate_enable(); \
+ } while (0)
+#endif
+
#else /* !CONFIG_MEMCG */
int trace_mmap_lock_reg(void)
Is the reason that &buf[idx] in get_memcg_path_buf() might go out of
bounds due to preemption in normal context if PREEMPT_RT=y? If so, can't we
add an "idx >= 0 && idx < CONTEXT_COUNT" check to get_memcg_path_buf() and
return NULL when preemption (or interrupt, or recursion, if any) has
exhausted the per-CPU buffer?
/*
* How many contexts our trace events might be called in: normal, softirq, irq,
* and NMI.
*/
#define CONTEXT_COUNT 4
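For illustration, a minimal sketch of what such a check might look like
(an assumption on my side: since buf_idx counts bytes, the upper bound would
be CONTEXT_COUNT * MEMCG_PATH_BUF_SIZE rather than CONTEXT_COUNT, and the
failed reservation has to be undone before returning NULL):

static inline char *get_memcg_path_buf(void)
{
	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
	char *buf;
	int idx;

	rcu_read_lock();
	buf = rcu_dereference(memcg_path->buf);
	if (buf == NULL) {
		rcu_read_unlock();
		return NULL;
	}
	idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
	      MEMCG_PATH_BUF_SIZE;
	/* Bail out if nested contexts already exhausted this CPU's buffer. */
	if (idx < 0 || idx >= CONTEXT_COUNT * MEMCG_PATH_BUF_SIZE) {
		local_sub(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx);
		rcu_read_unlock();
		return NULL;
	}
	return &buf[idx];
}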
On 2024/06/15 19:59, Tetsuo Handa wrote:
> Is the reason that &buf[idx] in get_memcg_path_buf() might go out of
> bounds due to preemption in normal context if PREEMPT_RT=y? If so, can't we
> add an "idx >= 0 && idx < CONTEXT_COUNT" check to get_memcg_path_buf() and
> return NULL when preemption (or interrupt, or recursion, if any) has
> exhausted the per-CPU buffer?
More simply, why not use an on-stack buffer, since MEMCG_PATH_BUF_SIZE is only 256?
mm/mmap_lock.c | 175 ++++++-------------------------------------------
1 file changed, 20 insertions(+), 155 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..59165c01c960 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -19,14 +19,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
#ifdef CONFIG_MEMCG
-/*
- * Our various events all share the same buffer (because we don't want or need
- * to allocate a set of buffers *per event type*), so we need to protect against
- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
- * been made.
- */
-static DEFINE_MUTEX(reg_lock);
-static int reg_refcount; /* Protected by reg_lock. */
+static atomic_t reg_refcount;
/*
* Size of the buffer for memcg path names. Ignoring stack trace support,
@@ -34,136 +27,22 @@ static int reg_refcount; /* Protected by reg_lock. */
*/
#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-/*
- * How many contexts our trace events might be called in: normal, softirq, irq,
- * and NMI.
- */
-#define CONTEXT_COUNT 4
-
-struct memcg_path {
- local_lock_t lock;
- char __rcu *buf;
- local_t buf_idx;
-};
-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
- .lock = INIT_LOCAL_LOCK(lock),
- .buf_idx = LOCAL_INIT(0),
-};
-
-static char **tmp_bufs;
-
-/* Called with reg_lock held. */
-static void free_memcg_path_bufs(void)
-{
- struct memcg_path *memcg_path;
- int cpu;
- char **old = tmp_bufs;
-
- for_each_possible_cpu(cpu) {
- memcg_path = per_cpu_ptr(&memcg_paths, cpu);
- *(old++) = rcu_dereference_protected(memcg_path->buf,
- lockdep_is_held(&reg_lock));
- rcu_assign_pointer(memcg_path->buf, NULL);
- }
-
- /* Wait for inflight memcg_path_buf users to finish. */
- synchronize_rcu();
-
- old = tmp_bufs;
- for_each_possible_cpu(cpu) {
- kfree(*(old++));
- }
-
- kfree(tmp_bufs);
- tmp_bufs = NULL;
-}
-
int trace_mmap_lock_reg(void)
{
- int cpu;
- char *new;
-
- mutex_lock(&reg_lock);
-
- /* If the refcount is going 0->1, proceed with allocating buffers. */
- if (reg_refcount++)
- goto out;
-
- tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
- GFP_KERNEL);
- if (tmp_bufs == NULL)
- goto out_fail;
-
- for_each_possible_cpu(cpu) {
- new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
- if (new == NULL)
- goto out_fail_free;
- rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
- /* Don't need to wait for inflights, they'd have gotten NULL. */
- }
-
-out:
- mutex_unlock(&reg_lock);
+ atomic_inc(&reg_refcount);
return 0;
-
-out_fail_free:
- free_memcg_path_bufs();
-out_fail:
- /* Since we failed, undo the earlier ref increment. */
- --reg_refcount;
-
- mutex_unlock(&reg_lock);
- return -ENOMEM;
}
void trace_mmap_lock_unreg(void)
{
- mutex_lock(&reg_lock);
-
- /* If the refcount is going 1->0, proceed with freeing buffers. */
- if (--reg_refcount)
- goto out;
-
- free_memcg_path_bufs();
-
-out:
- mutex_unlock(&reg_lock);
-}
-
-static inline char *get_memcg_path_buf(void)
-{
- struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
- char *buf;
- int idx;
-
- rcu_read_lock();
- buf = rcu_dereference(memcg_path->buf);
- if (buf == NULL) {
- rcu_read_unlock();
- return NULL;
- }
- idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
- MEMCG_PATH_BUF_SIZE;
- return &buf[idx];
+ atomic_dec(&reg_refcount);
}
-static inline void put_memcg_path_buf(void)
-{
- local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
- rcu_read_unlock();
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
- memcg_path = get_mm_memcg_path(mm); \
- trace_mmap_lock_##type(mm, \
- memcg_path != NULL ? memcg_path : "", \
- ##__VA_ARGS__); \
- if (likely(memcg_path != NULL)) \
- put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ char buf[MEMCG_PATH_BUF_SIZE]; \
+ get_mm_memcg_path(mm, buf, sizeof(buf)); \
+ trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
} while (0)
#else /* !CONFIG_MEMCG */
@@ -185,37 +64,23 @@ void trace_mmap_lock_unreg(void)
#ifdef CONFIG_TRACING
#ifdef CONFIG_MEMCG
/*
- * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, or no buffer was available
- * (because the trace event is being unregistered), NULL is returned.
- *
- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
- * disabled by the caller before calling us, and re-enabled only after the
- * caller is done with the pointer.
- *
- * The caller must call put_memcg_path_buf() once the buffer is no longer
- * needed. This must be done while preemption is still disabled.
+ * Write the given mm_struct's memcg path to an on-stack buffer. If the path
+ * cannot be determined or the trace event is being unregistered, an empty
+ * string is written.
*/
-static const char *get_mm_memcg_path(struct mm_struct *mm)
+static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
{
- char *buf = NULL;
- struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct mem_cgroup *memcg;
+ buf[0] = '\0';
+ /* No need to get path if no trace event is registered. */
+ if (!atomic_read(&reg_refcount))
+ return;
+ memcg = get_mem_cgroup_from_mm(mm);
if (memcg == NULL)
- goto out;
- if (unlikely(memcg->css.cgroup == NULL))
- goto out_put;
-
- buf = get_memcg_path_buf();
- if (buf == NULL)
- goto out_put;
-
- cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
-
-out_put:
+ return;
+ if (memcg->css.cgroup)
+ cgroup_path(memcg->css.cgroup, buf, buflen);
css_put(&memcg->css);
-out:
- return buf;
}
#endif /* CONFIG_MEMCG */
--
2.18.4