2020-08-03 14:51:18

by Kalesh Singh

[permalink] [raw]
Subject: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

Being able to analyze the per process usage of shared
dma buffers provides useful insights in situations where
the system is experiencing high memory pressure. This would
allow us to see exactly which processes are holding references
to the shared buffer.

Signed-off-by: Kalesh Singh <[email protected]>
---
drivers/dma-buf/dma-buf.c | 29 +++++++++++++
include/trace/events/dma_buf.h | 77 ++++++++++++++++++++++++++++++++++
2 files changed, 106 insertions(+)
create mode 100644 include/trace/events/dma_buf.h

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 1ca609f66fdf..1729191ac9ca 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -29,6 +29,9 @@
#include <uapi/linux/dma-buf.h>
#include <uapi/linux/magic.h>

+#define CREATE_TRACE_POINTS
+#include <trace/events/dma_buf.h>
+
static inline int is_dma_buf_file(struct file *);

struct dma_buf_list {
@@ -110,6 +113,15 @@ static struct file_system_type dma_buf_fs_type = {
.kill_sb = kill_anon_super,
};

+static void dma_buf_vma_close(struct vm_area_struct *area)
+{
+ trace_dma_buf_map_ref_dec(current, area->vm_file);
+}
+
+static const struct vm_operations_struct dma_buf_vm_ops = {
+ .close = dma_buf_vma_close,
+};
+
static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
{
struct dma_buf *dmabuf;
@@ -128,6 +140,9 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
dmabuf->size >> PAGE_SHIFT)
return -EINVAL;

+ trace_dma_buf_map_ref_inc(current, file);
+ vma->vm_ops = &dma_buf_vm_ops;
+
return dmabuf->ops->mmap(dmabuf, vma);
}

@@ -410,6 +425,17 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file)
spin_unlock(&dmabuf->name_lock);
}

+static int dma_buf_flush(struct file *filp, fl_owner_t id)
+{
+ trace_dma_buf_fd_ref_dec(current, filp);
+ return 0;
+}
+
+static void dma_buf_fd_install(int fd, struct file *filp)
+{
+ trace_dma_buf_fd_ref_inc(current, filp);
+}
+
static const struct file_operations dma_buf_fops = {
.mmap = dma_buf_mmap_internal,
.llseek = dma_buf_llseek,
@@ -417,6 +443,8 @@ static const struct file_operations dma_buf_fops = {
.unlocked_ioctl = dma_buf_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.show_fdinfo = dma_buf_show_fdinfo,
+ .fd_install = dma_buf_fd_install,
+ .flush = dma_buf_flush,
};

/*
@@ -1177,6 +1205,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
if (oldfile)
fput(oldfile);
}
+
return ret;

}
diff --git a/include/trace/events/dma_buf.h b/include/trace/events/dma_buf.h
new file mode 100644
index 000000000000..05af336cd849
--- /dev/null
+++ b/include/trace/events/dma_buf.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dma_buf
+
+#if !defined(_TRACE_DMA_BUF_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DMA_BUF_H
+
+#include <linux/dma-buf.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+#define UNKNOWN "<unknown>"
+
+#ifdef CREATE_TRACE_POINTS
+static inline struct dma_buf *dma_buffer(struct file *filp)
+{
+ return filp->private_data;
+}
+#endif
+
+DECLARE_EVENT_CLASS(dma_buf_ref_template,
+
+ TP_PROTO(struct task_struct *task, struct file *filp),
+
+ TP_ARGS(task, filp),
+
+ TP_STRUCT__entry(
+ __field(u32, tgid)
+ __field(u32, pid)
+ __field(u64, size)
+ __field(s64, count)
+ __string(exp_name, dma_buffer(filp)->exp_name)
+ __string(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN)
+ __field(u64, i_ino)
+ ),
+
+ TP_fast_assign(
+ __entry->tgid = task->tgid;
+ __entry->pid = task->pid;
+ __entry->size = dma_buffer(filp)->size;
+ __entry->count = file_count(filp);
+ __assign_str(exp_name, dma_buffer(filp)->exp_name);
+ __assign_str(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN);
+ __entry->i_ino = filp->f_inode->i_ino;
+ ),
+
+ TP_printk("tgid=%u pid=%u size=%llu count=%lld exp_name=%s name=%s i_ino=%llu",
+ __entry->tgid,
+ __entry->pid,
+ __entry->size,
+ __entry->count,
+ __get_str(exp_name),
+ __get_str(name),
+ __entry->i_ino
+ )
+);
+
+DEFINE_EVENT(dma_buf_ref_template, dma_buf_fd_ref_inc,
+ TP_PROTO(struct task_struct *task, struct file *filp),
+ TP_ARGS(task, filp));
+
+DEFINE_EVENT(dma_buf_ref_template, dma_buf_fd_ref_dec,
+ TP_PROTO(struct task_struct *task, struct file *filp),
+ TP_ARGS(task, filp));
+
+DEFINE_EVENT(dma_buf_ref_template, dma_buf_map_ref_inc,
+ TP_PROTO(struct task_struct *task, struct file *filp),
+ TP_ARGS(task, filp));
+
+DEFINE_EVENT(dma_buf_ref_template, dma_buf_map_ref_dec,
+ TP_PROTO(struct task_struct *task, struct file *filp),
+ TP_ARGS(task, filp));
+
+#endif /* _TRACE_DMA_BUF_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
2.28.0.163.g6104cc2f0b6-goog


2020-08-03 15:33:16

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, 3 Aug 2020 14:47:19 +0000
Kalesh Singh <[email protected]> wrote:

> +DECLARE_EVENT_CLASS(dma_buf_ref_template,
> +
> + TP_PROTO(struct task_struct *task, struct file *filp),
> +
> + TP_ARGS(task, filp),
> +
> + TP_STRUCT__entry(
> + __field(u32, tgid)
> + __field(u32, pid)

I only see "current" passed in as "task". Why are you recording the pid
and tgid, given that these are already available from the tracing infrastructure?

At least the pid is saved at every event. You can see the tgid when
enabling the "record_tgid".

# trace-cmd start -e all -O record_tgid
# trace-cmd show

# tracer: nop
#
# entries-in-buffer/entries-written: 39750/39750 #P:8
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION
# | | | | |||| | |
trace-cmd-28284 (28284) [005] .... 240338.934671: sys_exit: NR 1 = 1
kworker/3:2-27891 (27891) [003] d... 240338.934671: timer_start: timer=00000000d643debd function=delayed_work_timer_fn expires=4535008893 [timeout=1981] cpu=3 idx=186 flags=I
trace-cmd-28284 (28284) [005] .... 240338.934672: sys_write -> 0x1
kworker/3:2-27891 (27891) [003] .... 240338.934672: workqueue_execute_end: work struct 000000008fddd403: function psi_avgs_work
kworker/3:2-27891 (27891) [003] .... 240338.934673: workqueue_execute_start: work struct 00000000111c941e: function dbs_work_handler
kworker/3:2-27891 (27891) [003] .... 240338.934673: workqueue_execute_end: work struct 00000000111c941e: function dbs_work_handler
kworker/3:2-27891 (27891) [003] d... 240338.934673: rcu_utilization: Start context switch
kworker/3:2-27891 (27891) [003] d... 240338.934673: rcu_utilization: End context switch

-- Steve

> + __field(u64, size)
> + __field(s64, count)
> + __string(exp_name, dma_buffer(filp)->exp_name)
> + __string(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN)
> + __field(u64, i_ino)
> + ),
> +
> + TP_fast_assign(
> + __entry->tgid = task->tgid;
> + __entry->pid = task->pid;
> + __entry->size = dma_buffer(filp)->size;
> + __entry->count = file_count(filp);
> + __assign_str(exp_name, dma_buffer(filp)->exp_name);
> + __assign_str(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN);
> + __entry->i_ino = filp->f_inode->i_ino;
> + ),
> +

2020-08-03 15:44:52

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> +static void dma_buf_fd_install(int fd, struct file *filp)
> +{
> + trace_dma_buf_fd_ref_inc(current, filp);
> +}

You're adding a new file_operation in order to just add a new tracepoint?
NACK.

2020-08-03 16:00:46

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 3, 2020 at 8:41 AM Matthew Wilcox <[email protected]> wrote:
>
> On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> > +static void dma_buf_fd_install(int fd, struct file *filp)
> > +{
> > + trace_dma_buf_fd_ref_inc(current, filp);
> > +}
>
> You're adding a new file_operation in order to just add a new tracepoint?
> NACK.

Hi Matthew,
The plan is to attach a BPF to this tracepoint in order to track
dma-buf users. If you feel this is an overkill, what would you suggest
as an alternative?

2020-08-03 16:13:29

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 03, 2020 at 09:00:00AM -0700, Suren Baghdasaryan wrote:
> On Mon, Aug 3, 2020 at 8:41 AM Matthew Wilcox <[email protected]> wrote:
> >
> > On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> > > +static void dma_buf_fd_install(int fd, struct file *filp)
> > > +{
> > > + trace_dma_buf_fd_ref_inc(current, filp);
> > > +}
> >
> > You're adding a new file_operation in order to just add a new tracepoint?
> > NACK.
>
> Hi Matthew,
> The plan is to attach a BPF to this tracepoint in order to track
> dma-buf users. If you feel this is an overkill, what would you suggest
> as an alternative?

I'm sure BPF can attach to fd_install and filter on file->f_ops belonging
to dma_buf, for example.

2020-08-03 16:23:45

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 3, 2020 at 9:12 AM Matthew Wilcox <[email protected]> wrote:
>
> On Mon, Aug 03, 2020 at 09:00:00AM -0700, Suren Baghdasaryan wrote:
> > On Mon, Aug 3, 2020 at 8:41 AM Matthew Wilcox <[email protected]> wrote:
> > >
> > > On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> > > > +static void dma_buf_fd_install(int fd, struct file *filp)
> > > > +{
> > > > + trace_dma_buf_fd_ref_inc(current, filp);
> > > > +}
> > >
> > > You're adding a new file_operation in order to just add a new tracepoint?
> > > NACK.
> >
> > Hi Matthew,
> > The plan is to attach a BPF to this tracepoint in order to track
> > dma-buf users. If you feel this is an overkill, what would you suggest
> > as an alternative?
>
> I'm sure BPF can attach to fd_install and filter on file->f_ops belonging
> to dma_buf, for example.

Sounds like a workable solution. Will explore that direction. Thanks Matthew!

2020-08-03 16:34:21

by Kalesh Singh

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 03, 2020 at 11:32:39AM -0400, Steven Rostedt wrote:
> On Mon, 3 Aug 2020 14:47:19 +0000
> Kalesh Singh <[email protected]> wrote:
>
> > +DECLARE_EVENT_CLASS(dma_buf_ref_template,
> > +
> > + TP_PROTO(struct task_struct *task, struct file *filp),
> > +
> > + TP_ARGS(task, filp),
> > +
> > + TP_STRUCT__entry(
> > + __field(u32, tgid)
> > + __field(u32, pid)
>
> I only see "current" passed in as "task". Why are you recording the pid
> and tgid as these are available by the tracing infrastructure.
>
> At least the pid is saved at every event. You can see the tgid when
> enabling the "record_tgid".
>
> # trace-cmd start -e all -O record_tgid
> # trace-cmd show
>
> # tracer: nop
> #
> # entries-in-buffer/entries-written: 39750/39750 #P:8
> #
> # _-----=> irqs-off
> # / _----=> need-resched
> # | / _---=> hardirq/softirq
> # || / _--=> preempt-depth
> # ||| / delay
> # TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION
> # | | | | |||| | |
> trace-cmd-28284 (28284) [005] .... 240338.934671: sys_exit: NR 1 = 1
> kworker/3:2-27891 (27891) [003] d... 240338.934671: timer_start: timer=00000000d643debd function=delayed_work_timer_fn expires=4535008893 [timeout=1981] cpu=3 idx=186 flags=I
> trace-cmd-28284 (28284) [005] .... 240338.934672: sys_write -> 0x1
> kworker/3:2-27891 (27891) [003] .... 240338.934672: workqueue_execute_end: work struct 000000008fddd403: function psi_avgs_work
> kworker/3:2-27891 (27891) [003] .... 240338.934673: workqueue_execute_start: work struct 00000000111c941e: function dbs_work_handler
> kworker/3:2-27891 (27891) [003] .... 240338.934673: workqueue_execute_end: work struct 00000000111c941e: function dbs_work_handler
> kworker/3:2-27891 (27891) [003] d... 240338.934673: rcu_utilization: Start context switch
> kworker/3:2-27891 (27891) [003] d... 240338.934673: rcu_utilization: End context switch
>
> -- Steve
>
Thanks for the comments Steve. I'll remove the task arg.

> > + __field(u64, size)
> > + __field(s64, count)
> > + __string(exp_name, dma_buffer(filp)->exp_name)
> > + __string(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN)
> > + __field(u64, i_ino)
> > + ),
> > +
> > + TP_fast_assign(
> > + __entry->tgid = task->tgid;
> > + __entry->pid = task->pid;
> > + __entry->size = dma_buffer(filp)->size;
> > + __entry->count = file_count(filp);
> > + __assign_str(exp_name, dma_buffer(filp)->exp_name);
> > + __assign_str(name, dma_buffer(filp)->name ? dma_buffer(filp)->name : UNKNOWN);
> > + __entry->i_ino = filp->f_inode->i_ino;
> > + ),
> > +

2020-08-03 22:31:25

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 03, 2020 at 09:22:53AM -0700, Suren Baghdasaryan wrote:
> On Mon, Aug 3, 2020 at 9:12 AM Matthew Wilcox <[email protected]> wrote:
> >
> > On Mon, Aug 03, 2020 at 09:00:00AM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Aug 3, 2020 at 8:41 AM Matthew Wilcox <[email protected]> wrote:
> > > >
> > > > On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> > > > > +static void dma_buf_fd_install(int fd, struct file *filp)
> > > > > +{
> > > > > + trace_dma_buf_fd_ref_inc(current, filp);
> > > > > +}
> > > >
> > > > You're adding a new file_operation in order to just add a new tracepoint?
> > > > NACK.
> > >
> > > Hi Matthew,
> > > The plan is to attach a BPF to this tracepoint in order to track
> > > dma-buf users. If you feel this is an overkill, what would you suggest
> > > as an alternative?
> >
> > I'm sure BPF can attach to fd_install and filter on file->f_ops belonging
> > to dma_buf, for example.
>
> Sounds like a workable solution. Will explore that direction. Thanks Matthew!

No, it is not a solution at all.

What kind of locking would you use? With _any_ of those approaches.

How would you use the information that is hopelessly out of date/incoherent/whatnot
at the very moment you obtain it?

IOW, what the hell is that horror for? You do realize, for example, that there's
such thing as dup(), right? And dup2() as well. And while we are at it, how
do you keep track of removals, considering the fact that you can stick a file
reference into SCM_RIGHTS datagram sent to yourself, close descriptors and an hour
later pick that datagram, suddenly getting descriptor back?

Besides, "I have no descriptors left" != "I can't be currently sitting in the middle
of syscall on that sucker"; close() does *NOT* terminate ongoing operations.

You are looking at the drastically wrong abstraction level. Please, describe what
it is that you are trying to achieve.

2020-08-04 01:09:54

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 03, 2020 at 11:28:31PM +0100, Al Viro wrote:

> IOW, what the hell is that horror for? You do realize, for example, that there's
> such thing as dup(), right? And dup2() as well. And while we are at it, how
> do you keep track of removals, considering the fact that you can stick a file
> reference into SCM_RIGHTS datagram sent to yourself, close descriptors and an hour
> later pick that datagram, suddenly getting descriptor back?
>
> Besides, "I have no descriptors left" != "I can't be currently sitting in the middle
> of syscall on that sucker"; close() does *NOT* terminate ongoing operations.
>
> You are looking at the drastically wrong abstraction level. Please, describe what
> it is that you are trying to achieve.

_IF_ it's "who keeps a particularly long-lived sucker pinned", I would suggest
fuser(1) run when you detect that kind of long-lived dmabuf. With events generated
by their constructors and destructors, and detection of longevity done based on
that.

But that's only a semi-blind guess at the things you are trying to achieve; please,
describe what it really is.

2020-08-04 02:11:58

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Mon, Aug 3, 2020 at 6:09 PM Al Viro <[email protected]> wrote:
>
> On Mon, Aug 03, 2020 at 11:28:31PM +0100, Al Viro wrote:
>
> > IOW, what the hell is that horror for? You do realize, for example, that there's
> > such thing as dup(), right? And dup2() as well. And while we are at it, how
> > do you keep track of removals, considering the fact that you can stick a file
> > reference into SCM_RIGHTS datagram sent to yourself, close descriptors and an hour
> > later pick that datagram, suddenly getting descriptor back?
> >
> > Besides, "I have no descriptors left" != "I can't be currently sitting in the middle
> > of syscall on that sucker"; close() does *NOT* terminate ongoing operations.

Thanks for your feedback, Al. I see your points and sorry for not
realizing these shortcomings.

> >
> > You are looking at the drastically wrong abstraction level. Please, describe what
> > it is that you are trying to achieve.
>
> _IF_ it's "who keeps a particularly long-lived sucker pinned", I would suggest
> fuser(1) run when you detect that kind of long-lived dmabuf. With events generated
> by their constructors and destructors, and detection of longevity done based on
> that.

That is the intention here. IIUC fuser(1) would require root access to
collect this information from a process other than the caller. Ideally
what we would like to have is a non-root process with specific
capabilities (in our case a process that can access BPF maps) to be
able to obtain the information on dma-buf users.
However, it might make more sense to track dma-buf usage from
dma_buf_getfile, dma_buf_get and dma_buf_put since these calls are the
ones that affect file refcount. Will dig some more into this.
Thanks for your time and sorry for not thinking it through beforehand.

>
> But that's only a semi-blind guess at the things you are trying to achieve; please,
> describe what it really is.

2020-08-04 15:45:51

by Kalesh Singh

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Tue, Aug 04, 2020 at 02:09:13AM +0100, Al Viro wrote:
> On Mon, Aug 03, 2020 at 11:28:31PM +0100, Al Viro wrote:
>
> > IOW, what the hell is that horror for? You do realize, for example, that there's
> > such thing as dup(), right? And dup2() as well. And while we are at it, how
> > do you keep track of removals, considering the fact that you can stick a file
> > reference into SCM_RIGHTS datagram sent to yourself, close descriptors and an hour
> > later pick that datagram, suddenly getting descriptor back?
> >
> > Besides, "I have no descriptors left" != "I can't be currently sitting in the middle
> > of syscall on that sucker"; close() does *NOT* terminate ongoing operations.
> >
> > You are looking at the drastically wrong abstraction level. Please, describe what
> > it is that you are trying to achieve.

Hi Al. Thank you for the comments. Ultimately what we need is to identify processes
that hold a file reference to the dma-buf. Unfortunately we can't use only
explicit dma_buf_get/dma_buf_put to track them because when an FD is being shared
between processes the file references are taken implicitly.

For example, on the sender side:
unix_dgram_sendmsg -> send_scm -> __send_scm -> scm_fp_copy -> fget_raw
and on the receiver side:
unix_dgram_recvmsg -> scm_recv -> scm_detach_fds -> __scm_install_fd -> get_file

I understand now that fd_install is not an appropriate abstraction level to track these.
Is there a more appropriate alternative that we could use to track these implicit file
references?

> _IF_ it's "who keeps a particularly long-lived sucker pinned", I would suggest
> fuser(1) run when you detect that kind of long-lived dmabuf. With events generated
> by their constructors and destructors, and detection of longevity done based on
> that.
>
> But that's only a semi-blind guess at the things you are trying to achieve; please,
> describe what it really is.

2020-08-04 18:30:10

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Tue, Aug 04, 2020 at 03:44:51PM +0000, Kalesh Singh wrote:

> Hi Al. Thank you for the comments. Ultimately what we need is to identify processes
> that hold a file reference to the dma-buf. Unfortunately we can't use only
> explicit dma_buf_get/dma_buf_put to track them because when an FD is being shared
> between processes the file references are taken implicitly.
>
> For example, on the sender side:
> unix_dgram_sendmsg -> send_scm -> __send_scm -> scm_fp_copy -> fget_raw
> and on the receiver side:
> unix_dgram_recvmsg -> scm_recv -> scm_detach_fds -> __scm_install_fd -> get_file
>
> I understand now that fd_install is not an appropriate abstraction level to track these.
> Is there a more appropriate alternative where we could use to track these implicit file
> references?

There is no single lock that would stabilize the descriptor tables of all
processes. And there's not going to be one, ever - it would be a contention
point from hell, since that would've been a system-wide lock that would have
to be taken by *ALL* syscalls modifying any descriptor table. Not going to
happen, for obvious reasons. Moreover, you would have to have fork(2) take
the same lock, since it does copy descriptor table. And clone(2) either does
the same, or has the child share the descriptor table of parent.

What's more, a reference to struct file can bloody well survive without
a single descriptor referring to that file. In the example you've mentioned
above, sender has every right to close all descriptors it has sent. Files
will stay opened as long as the references are held in the datagram; when
that datagram is received, the references will be inserted into recipient's
descriptor table. At that point you again have descriptors referring to
that file, can do any IO on it, etc.

So "the set of processes that hold a file reference to the dma-buf" is
* inherently unstable, unless you are willing to freeze every
process in the system except for the one trying to find that set.
* can remain empty for any amount of time (hours, weeks, whatever),
only to get non-empty later, with syscalls affecting the object in question
done afterwards.

So... what were you going to do with that set if you could calculate it?
If it's really "how do we debug a leak?", it's one thing; in that case
I would suggest keeping track of creation/destruction of objects (not
gaining/dropping references - actual constructors and destructors) to
see what gets stuck around for too long and use fuser(1) to try and locate
the culprits if you see that something *was* living for too long. "Try"
since the only reference might indeed have been stashed into an SCM_RIGHTS
datagram sitting in a queue of some AF_UNIX socket. Note that "fuser
needs elevated privileges" is not a strong argument - the ability to
do that sort of tracking does imply elevated privileges anyway, and
having a root process taking requests along the lines of "gimme the
list of PIDs that have such-and-such dma_buf in their descriptor table"
is not much of an attack surface.

If you want to use it for something else, you'll need to describe that
intended use; there might be sane ways to do that, but it's hard to
come up with one without knowing what's being attempted...

2020-08-04 20:29:50

by Daniel Vetter

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Tue, Aug 4, 2020 at 12:28 AM Al Viro <[email protected]> wrote:
>
> On Mon, Aug 03, 2020 at 09:22:53AM -0700, Suren Baghdasaryan wrote:
> > On Mon, Aug 3, 2020 at 9:12 AM Matthew Wilcox <[email protected]> wrote:
> > >
> > > On Mon, Aug 03, 2020 at 09:00:00AM -0700, Suren Baghdasaryan wrote:
> > > > On Mon, Aug 3, 2020 at 8:41 AM Matthew Wilcox <[email protected]> wrote:
> > > > >
> > > > > On Mon, Aug 03, 2020 at 02:47:19PM +0000, Kalesh Singh wrote:
> > > > > > +static void dma_buf_fd_install(int fd, struct file *filp)
> > > > > > +{
> > > > > > + trace_dma_buf_fd_ref_inc(current, filp);
> > > > > > +}
> > > > >
> > > > > You're adding a new file_operation in order to just add a new tracepoint?
> > > > > NACK.
> > > >
> > > > Hi Matthew,
> > > > The plan is to attach a BPF to this tracepoint in order to track
> > > > dma-buf users. If you feel this is an overkill, what would you suggest
> > > > as an alternative?
> > >
> > > I'm sure BPF can attach to fd_install and filter on file->f_ops belonging
> > > to dma_buf, for example.
> >
> > Sounds like a workable solution. Will explore that direction. Thanks Matthew!
>
> No, it is not a solution at all.
>
> What kind of locking would you use? With _any_ of those approaches.
>
> How would you use the information that is hopelessly out of date/incoherent/whatnot
> at the very moment you obtain it?
>
> IOW, what the hell is that horror for? You do realize, for example, that there's
> such thing as dup(), right? And dup2() as well. And while we are at it, how
> do you keep track of removals, considering the fact that you can stick a file
> reference into SCM_RIGHTS datagram sent to yourself, close descriptors and an hour
> later pick that datagram, suddenly getting descriptor back?
>
> Besides, "I have no descriptors left" != "I can't be currently sitting in the middle
> of syscall on that sucker"; close() does *NOT* terminate ongoing operations.
>
> You are looking at the drastically wrong abstraction level. Please, describe what
> it is that you are trying to achieve.

For added entertainment (since this is specifically about dma-buf) you
can stuff them into various gpu drivers, and convert to a native gpu
driver handle thing. That's actually the expected use case, first a
buffer sharing gets established with AF_UNIX, then both sides close
the dma-buf fd handle.

GPU drivers then internally cache the struct file so that we can hand
out the same (to avoid confusion when re-importing it on some other
driver), so for the case of dma-buf the "it's not actually an
installed fd anywhere for unlimited time" is actually the normal
use-case, not some odd corner.

Cheers, Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

2020-08-04 20:46:05

by Kalesh Singh

[permalink] [raw]
Subject: Re: [PATCH 2/2] dmabuf/tracing: Add dma-buf trace events

On Tue, Aug 04, 2020 at 07:27:24PM +0100, Al Viro wrote:
> On Tue, Aug 04, 2020 at 03:44:51PM +0000, Kalesh Singh wrote:
>
> > Hi Al. Thank you for the comments. Ultimately what we need is to identify processes
> > that hold a file reference to the dma-buf. Unfortunately we can't use only
> > explicit dma_buf_get/dma_buf_put to track them because when an FD is being shared
> > between processes the file references are taken implicitly.
> >
> > For example, on the sender side:
> > unix_dgram_sendmsg -> send_scm -> __send_scm -> scm_fp_copy -> fget_raw
> > and on the receiver side:
> > unix_dgram_recvmsg -> scm_recv -> scm_detach_fds -> __scm_install_fd -> get_file
> >
> > I understand now that fd_install is not an appropriate abstraction level to track these.
> > Is there a more appropriate alternative where we could use to track these implicit file
> > references?
>
> There is no single lock that would stabilize the descriptor tables of all
> processes. And there's not going to be one, ever - it would be a contention
> point from hell, since that would've been a system-wide lock that would have
> to be taken by *ALL* syscalls modifying any descriptor table. Not going to
> happen, for obvious reasons. Moreover, you would have to have fork(2) take
> the same lock, since it does copy descriptor table. And clone(2) either does
> the same, or has the child share the descriptor table of parent.
>
> What's more, a reference to struct file can bloody well survive without
> a single descriptor referring to that file. In the example you've mentioned
> above, sender has every right to close all descriptors it has sent. Files
> will stay opened as long as the references are held in the datagram; when
> that datagram is received, the references will be inserted into recipient's
> descriptor table. At that point you again have descriptors referring to
> that file, can do any IO on it, etc.
>
> So "the set of processes that hold a file reference to the dma-buf" is
> * inherently unstable, unless you are willing to freeze every
> process in the system except for the one trying to find that set.
> * can remain empty for any amount of time (hours, weeks, whatever),
> only to get non-empty later, with syscalls affecting the object in question
> done afterwards.
>
> So... what were you going to do with that set if you could calculate it?
> If it's really "how do we debug a leak?", it's one thing; in that case
> I would suggest keeping track of creation/destruction of objects (not
> gaining/dropping references - actual constructors and destructors) to
> see what gets stuck around for too long and use fuser(1) to try and locate
> the culprits if you see that something *was* living for too long. "Try"
> since the only reference might indeed have been stashed into an SCM_RIGHTS
> datagram sitting in a queue of some AF_UNIX socket. Note that "fuser
> needs elevated privileges" is not a strong argument - the ability to
> do that sort of tracking does imply elevated privileges anyway, and
> having a root process taking requests along the lines of "gimme the
> list of PIDs that have such-and-such dma_buf in their descriptor table"
> is not much of an attack surface.
>
> If you want to use it for something else, you'll need to describe that
> intended use; there might be sane ways to do that, but it's hard to
> come up with one without knowing what's being attempted...

Hi Al. Thanks for the guidance and detailed explanation. It appears what we
were trying to accomplish here is not feasible.

Thanks, Kalesh