2020-10-07 18:46:49

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH v2 2/2] mmap_lock: add tracepoints around lock acquisition

The goal of these tracepoints is to be able to debug lock contention
issues. This lock is acquired on most (all?) mmap / munmap / page fault
operations, so a multi-threaded process which does a lot of these can
experience significant contention.

We trace just before we start acquisition, when the acquisition returns
(whether it succeeded or not), and when the lock is released (or
downgraded). The events are broken out by lock type (read / write).

The events are also broken out by memcg path. For container-based
workloads, users often think of several processes in a memcg as a single
logical "task", so collecting statistics at this level is useful.

The end goal is to get latency information. This isn't directly included
in the trace events. Instead, users are expected to compute the time
between "start locking" and "acquire returned", using e.g. synthetic
events or BPF. The benefit we get from this is simpler code.

Because we use tracepoint_enabled() to decide whether or not to trace,
this patch has effectively no overhead unless tracepoints are enabled at
runtime. If tracepoints are enabled, there is a performance impact, but
how much depends on exactly what e.g. the BPF program does.

Signed-off-by: Axel Rasmussen <[email protected]>
---
include/linux/mmap_lock.h | 112 ++++++++++++++++++++++++++++---
include/trace/events/mmap_lock.h | 70 +++++++++++++++++++
mm/Makefile | 2 +-
mm/mmap_lock.c | 87 ++++++++++++++++++++++++
4 files changed, 261 insertions(+), 10 deletions(-)
create mode 100644 include/trace/events/mmap_lock.h
create mode 100644 mm/mmap_lock.c

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e1afb420fc94..aa3d64f99d17 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -1,11 +1,95 @@
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

+#include <linux/lockdep.h>
+#include <linux/mm_types.h>
#include <linux/mmdebug.h>
+#include <linux/rwsem.h>
+#include <linux/tracepoint-defs.h>
+#include <linux/types.h>

#define MMAP_LOCK_INITIALIZER(name) \
.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

+DECLARE_TRACEPOINT(mmap_lock_start_locking);
+DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
+DECLARE_TRACEPOINT(mmap_lock_released);
+
+#ifdef CONFIG_TRACING
+
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+ bool success);
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);
+
+#else /* !COFNIG_TRACING */
+
+static inline void __mmap_lock_do_trace_start_locking(struct mm_struct *mm,
+ bool write)
+{
+}
+
+static inline void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm,
+ bool write,
+ bool success)
+{
+}
+
+static inline void __mmap_lock_do_trace_released(struct mm_struct *mm,
+ bool write)
+{
+}
+
+#endif /* CONFIG_TRACING */
+
+static inline bool
+__mmap_lock_traced_trylock(struct mm_struct *mm,
+ int (*trylock)(struct rw_semaphore *), bool write)
+{
+ bool ret;
+
+ if (tracepoint_enabled(mmap_lock_start_locking))
+ __mmap_lock_do_trace_start_locking(mm, write);
+ ret = trylock(&mm->mmap_lock) != 0;
+ if (tracepoint_enabled(mmap_lock_acquire_returned))
+ __mmap_lock_do_trace_acquire_returned(mm, write, ret);
+ return ret;
+}
+
+static inline void __mmap_lock_traced_lock(struct mm_struct *mm,
+ void (*lock)(struct rw_semaphore *),
+ bool write)
+{
+ if (tracepoint_enabled(mmap_lock_start_locking))
+ __mmap_lock_do_trace_start_locking(mm, write);
+ lock(&mm->mmap_lock);
+ if (tracepoint_enabled(mmap_lock_acquire_returned))
+ __mmap_lock_do_trace_acquire_returned(mm, write, true);
+}
+
+static inline int
+__mmap_lock_traced_lock_return(struct mm_struct *mm,
+ int (*lock)(struct rw_semaphore *), bool write)
+{
+ int ret;
+
+ if (tracepoint_enabled(mmap_lock_start_locking))
+ __mmap_lock_do_trace_start_locking(mm, write);
+ ret = lock(&mm->mmap_lock);
+ if (tracepoint_enabled(mmap_lock_acquire_returned))
+ __mmap_lock_do_trace_acquire_returned(mm, write, ret == 0);
+ return ret;
+}
+
+static inline void
+__mmap_lock_traced_unlock(struct mm_struct *mm,
+ void (*unlock)(struct rw_semaphore *), bool write)
+{
+ unlock(&mm->mmap_lock);
+ if (tracepoint_enabled(mmap_lock_released))
+ __mmap_lock_do_trace_released(mm, write);
+}
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
@@ -13,58 +97,66 @@ static inline void mmap_init_lock(struct mm_struct *mm)

static inline void mmap_write_lock(struct mm_struct *mm)
{
- down_write(&mm->mmap_lock);
+ __mmap_lock_traced_lock(mm, down_write, true);
}

static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
+ if (tracepoint_enabled(mmap_lock_start_locking))
+ __mmap_lock_do_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
+ if (tracepoint_enabled(mmap_lock_acquire_returned))
+ __mmap_lock_do_trace_acquire_returned(mm, true, true);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
- return down_write_killable(&mm->mmap_lock);
+ return __mmap_lock_traced_lock_return(mm, down_write_killable, true);
}

static inline bool mmap_write_trylock(struct mm_struct *mm)
{
- return down_write_trylock(&mm->mmap_lock) != 0;
+ return __mmap_lock_traced_trylock(mm, down_write_trylock, true);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
- up_write(&mm->mmap_lock);
+ __mmap_lock_traced_unlock(mm, up_write, true);
}

static inline void mmap_write_downgrade(struct mm_struct *mm)
{
downgrade_write(&mm->mmap_lock);
+ if (tracepoint_enabled(mmap_lock_acquire_returned))
+ __mmap_lock_do_trace_acquire_returned(mm, false, true);
}

static inline void mmap_read_lock(struct mm_struct *mm)
{
- down_read(&mm->mmap_lock);
+ __mmap_lock_traced_lock(mm, down_read, false);
}

static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
- return down_read_killable(&mm->mmap_lock);
+ return __mmap_lock_traced_lock_return(mm, down_read_killable, false);
}

static inline bool mmap_read_trylock(struct mm_struct *mm)
{
- return down_read_trylock(&mm->mmap_lock) != 0;
+ return __mmap_lock_traced_trylock(mm, down_read_trylock, false);
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
- up_read(&mm->mmap_lock);
+ __mmap_lock_traced_unlock(mm, up_read, false);
}

static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
{
- if (down_read_trylock(&mm->mmap_lock)) {
+ if (mmap_read_trylock(mm)) {
rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ if (tracepoint_enabled(mmap_lock_released))
+ __mmap_lock_do_trace_released(mm, false);
return true;
}
return false;
@@ -73,6 +165,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
up_read_non_owner(&mm->mmap_lock);
+ if (tracepoint_enabled(mmap_lock_released))
+ __mmap_lock_do_trace_released(mm, false);
}

static inline void mmap_assert_locked(struct mm_struct *mm)
diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
new file mode 100644
index 000000000000..ca652b52510e
--- /dev/null
+++ b/include/trace/events/mmap_lock.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mmap_lock
+
+#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MMAP_LOCK_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+struct mm_struct;
+
+DECLARE_EVENT_CLASS(
+ mmap_lock_template,
+
+ TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
+ bool success),
+
+ TP_ARGS(mm, memcg_path, write, success),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __string(memcg_path, memcg_path)
+ __field(bool, write)
+ __field(bool, success)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __assign_str(memcg_path, memcg_path);
+ __entry->write = write;
+ __entry->success = success;
+ ),
+
+ TP_printk(
+ "mm=%p memcg_path=%s write=%s success=%s\n",
+ __entry->mm,
+ __get_str(memcg_path),
+ __entry->write ? "true" : "false",
+ __entry->success ? "true" : "false")
+ );
+
+DEFINE_EVENT(mmap_lock_template, mmap_lock_start_locking,
+
+ TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
+ bool success),
+
+ TP_ARGS(mm, memcg_path, write, success)
+);
+
+DEFINE_EVENT(mmap_lock_template, mmap_lock_acquire_returned,
+
+ TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
+ bool success),
+
+ TP_ARGS(mm, memcg_path, write, success)
+);
+
+DEFINE_EVENT(mmap_lock_template, mmap_lock_released,
+
+ TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
+ bool success),
+
+ TP_ARGS(mm, memcg_path, write, success)
+);
+
+#endif /* _TRACE_MMAP_LOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/Makefile b/mm/Makefile
index 069f216e109e..b6cd2fffa492 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o $(mmu-y)
+ debug.o gup.o mmap_lock.o $(mmu-y)

# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
new file mode 100644
index 000000000000..b849287bd12a
--- /dev/null
+++ b/mm/mmap_lock.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap_lock.h>
+
+#include <linux/mm.h>
+#include <linux/cgroup.h>
+#include <linux/memcontrol.h>
+#include <linux/mmap_lock.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/trace_events.h>
+
+/*
+ * We have to export these, as drivers use mmap_lock, and our inline functions
+ * in the header check if the tracepoint is enabled. They can't be GPL, as e.g.
+ * the nvidia driver is an existing caller of this code.
+ */
+EXPORT_SYMBOL(__tracepoint_mmap_lock_start_locking);
+EXPORT_SYMBOL(__tracepoint_mmap_lock_acquire_returned);
+EXPORT_SYMBOL(__tracepoint_mmap_lock_released);
+
+#ifdef CONFIG_MEMCG
+
+DEFINE_PER_CPU(char[MAX_FILTER_STR_VAL], trace_memcg_path);
+
+/*
+ * Write the given mm_struct's memcg path to a percpu buffer, and return a
+ * pointer to it. If the path cannot be determined, the buffer will contain the
+ * empty string.
+ *
+ * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
+ * disabled by the caller before calling us, and re-enabled only after the
+ * caller is done with the pointer.
+ */
+static const char *get_mm_memcg_path(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+
+ if (memcg != NULL && likely(memcg->css.cgroup != NULL)) {
+ char *buf = this_cpu_ptr(trace_memcg_path);
+
+ cgroup_path(memcg->css.cgroup, buf, MAX_FILTER_STR_VAL);
+ return buf;
+ }
+ return "";
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ if (trace_mmap_lock_##type##_enabled()) { \
+ get_cpu(); \
+ trace_mmap_lock_##type(mm, get_mm_memcg_path(mm), \
+ ##__VA_ARGS__); \
+ put_cpu(); \
+ } \
+ } while (0)
+
+#else /* !CONFIG_MEMCG */
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
+
+#endif /* CONFIG_MEMCG */
+
+/*
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ */
+
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(start_locking, mm, write, true);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+ bool success)
+{
+ TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
+
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(released, mm, write, true);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
--
2.28.0.806.g8561365e88-goog


2020-10-08 07:41:57

by Michel Lespinasse

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] mmap_lock: add tracepoints around lock acquisition

On Wed, Oct 7, 2020 at 11:44 AM Axel Rasmussen <[email protected]> wrote:
> The goal of these tracepoints is to be able to debug lock contention
> issues. This lock is acquired on most (all?) mmap / munmap / page fault
> operations, so a multi-threaded process which does a lot of these can
> experience significant contention.
>
> We trace just before we start acquisition, when the acquisition returns
> (whether it succeeded or not), and when the lock is released (or
> downgraded). The events are broken out by lock type (read / write).
>
> The events are also broken out by memcg path. For container-based
> workloads, users often think of several processes in a memcg as a single
> logical "task", so collecting statistics at this level is useful.
>
> The end goal is to get latency information. This isn't directly included
> in the trace events. Instead, users are expected to compute the time
> between "start locking" and "acquire returned", using e.g. synthetic
> events or BPF. The benefit we get from this is simpler code.
>
> Because we use tracepoint_enabled() to decide whether or not to trace,
> this patch has effectively no overhead unless tracepoints are enabled at
> runtime. If tracepoints are enabled, there is a performance impact, but
> how much depends on exactly what e.g. the BPF program does.
>
> Signed-off-by: Axel Rasmussen <[email protected]>

Thanks for working on this.

I like that there is no overhead unless CONFIG_TRACING is set.
However, I think the __mmap_lock_traced_lock and similar functions are
the wrong level of abstraction, especially considering that we are
considering to switch away from using rwsem as the underlying lock
implementation. Would you consider something along the following lines
instead for include/linux/mmap_lock.h ?

#ifdef CONFIG_TRACING

DECLARE_TRACEPOINT(...);

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);

static inline void mmap_lock_trace_start_locking(struct mm_struct *mm,
bool write)
{
if (tracepoint_enabled(mmap_lock_start_locking))
__mmap_lock_do_trace_start_locking(mm, write);
}

#else

static inline void mmap_lock_trace_start_locking(struct mm_struct *mm,
bool write) {}

#endif

static inline void mmap_write_lock(struct mm_struct *mm)
{
mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
mmap_lock_trace_acquire_returned(mm, true, true);
}

I think this is more straightforward, and also the
mmap_lock_trace_start_locking and similar functions don't depend on
the underlying lock implementation.

The changes to the other files look fine to me.

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

2020-10-08 18:57:25

by Axel Rasmussen

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] mmap_lock: add tracepoints around lock acquisition

On Thu, Oct 8, 2020 at 12:40 AM Michel Lespinasse <[email protected]> wrote:
>
> On Wed, Oct 7, 2020 at 11:44 AM Axel Rasmussen <[email protected]> wrote:
> > The goal of these tracepoints is to be able to debug lock contention
> > issues. This lock is acquired on most (all?) mmap / munmap / page fault
> > operations, so a multi-threaded process which does a lot of these can
> > experience significant contention.
> >
> > We trace just before we start acquisition, when the acquisition returns
> > (whether it succeeded or not), and when the lock is released (or
> > downgraded). The events are broken out by lock type (read / write).
> >
> > The events are also broken out by memcg path. For container-based
> > workloads, users often think of several processes in a memcg as a single
> > logical "task", so collecting statistics at this level is useful.
> >
> > The end goal is to get latency information. This isn't directly included
> > in the trace events. Instead, users are expected to compute the time
> > between "start locking" and "acquire returned", using e.g. synthetic
> > events or BPF. The benefit we get from this is simpler code.
> >
> > Because we use tracepoint_enabled() to decide whether or not to trace,
> > this patch has effectively no overhead unless tracepoints are enabled at
> > runtime. If tracepoints are enabled, there is a performance impact, but
> > how much depends on exactly what e.g. the BPF program does.
> >
> > Signed-off-by: Axel Rasmussen <[email protected]>
>
> Thanks for working on this.
>
> I like that there is no overhead unless CONFIG_TRACING is set.

Actually, it's even a bit better. Even if CONFIG_TRACING is set, the
only overhead we add is a single static_key branch (much cheaper than
a normal branch). The overhead is only introduced when the tracepoints
are enabled at runtime, e.g. by writing to
/sys/kernel/trace/events/(event)/enabled.


> However, I think the __mmap_lock_traced_lock and similar functions are
> the wrong level of abstraction, especially considering that we are
> considering to switch away from using rwsem as the underlying lock
> implementation. Would you consider something along the following lines
> instead for include/linux/mmap_lock.h ?

Sure, there's no reason that can't work. It's just a tradeoff:

Breaking up these helpers the way you describe makes the locking
functions more verbose (more repeated code; overall this file would be
longer), but there are a few less lines to change when we switch from
rwsem to something else (because we'd only need to change the lock
functions vs. as-is we'd have to change a couple of lines in each of
the helpers too).

I prefer the way it is, but I don't feel that strongly about it. I'll
refactor and send a v3 that looks as you describe.

>
> #ifdef CONFIG_TRACING
>
> DECLARE_TRACEPOINT(...);
>
> void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
>
> static inline void mmap_lock_trace_start_locking(struct mm_struct *mm,
> bool write)
> {
> if (tracepoint_enabled(mmap_lock_start_locking))
> __mmap_lock_do_trace_start_locking(mm, write);
> }
>
> #else
>
> static inline void mmap_lock_trace_start_locking(struct mm_struct *mm,
> bool write) {}
>
> #endif
>
> static inline void mmap_write_lock(struct mm_struct *mm)
> {
> mmap_lock_trace_start_locking(mm, true);
> down_write(&mm->mmap_lock);
> mmap_lock_trace_acquire_returned(mm, true, true);
> }
>
> I think this is more straightforward, and also the
> mmap_lock_trace_start_locking and similar functions don't depend on
> the underlying lock implementation.
>
> The changes to the other files look fine to me.
>
> --
> Michel "Walken" Lespinasse
> A program is never fully debugged until the last user dies.