As part of the discussions about aligning user_events for use with eBPF,
it became clear [1] that we needed a way to delete events without having to rely
upon the delete IOCTL. Steven suggested that we simply have an owner
for the event, however, the event can be held by more than just the
first register FD, such as perf/ftrace or additional registers. In order
to handle all those cases, we must only delete after all references are
gone from both user and kernel space.
This series adds a new register flag, USER_EVENT_REG_PERSIST, which
causes the event to not delete itself upon the last put reference. We
cannot fully drop the delete IOCTL, since we still want to enable events to
be registered early via dynamic_events and persist. Events that do not use
this new flag are auto-cleaned up upon no longer being used.
NOTE: I'll need to merge this work once we take these [2] [3] patches
into for-next. I'm happy to do so once they land there.
1: https://lore.kernel.org/linux-trace-kernel/[email protected]/
2: https://lore.kernel.org/linux-trace-kernel/[email protected]/
3: https://lore.kernel.org/linux-trace-kernel/[email protected]/
Change history
v2:
Renamed series to "Add auto cleanup and a flag to persist events"
Changed auto-delete to be default behavior, with new flag to persist events
Beau Belgrave (5):
tracing/user_events: Store register flags on events
tracing/user_events: Track refcount consistently via put/get
tracing/user_events: Add auto cleanup and a flag to persist events
tracing/user_events: Add self-test for persist flag
tracing/user_events: Add persist flag documentation
Documentation/trace/user_events.rst | 21 +-
include/uapi/linux/user_events.h | 10 +-
kernel/trace/trace_events_user.c | 184 ++++++++++++++----
.../testing/selftests/user_events/abi_test.c | 144 +++++++++++++-
.../selftests/user_events/ftrace_test.c | 1 +
5 files changed, 309 insertions(+), 51 deletions(-)
base-commit: 3862f86c1529fa0016de6344eb974877b4cd3838
--
2.25.1
Currently we don't have any available flags for user processes to use to
indicate options for user_events. We will soon have a flag to indicate
the event should or should not auto-delete once it's not being used by
anyone.
Add a reg_flags field to user_events and parameters to existing
functions to allow for this in future patches.
Signed-off-by: Beau Belgrave <[email protected]>
---
kernel/trace/trace_events_user.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b1ecd7677642..34aa0a5d8e2a 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -87,6 +87,7 @@ struct user_event {
struct list_head validators;
refcount_t refcnt;
int min_size;
+ int reg_flags;
char status;
};
@@ -163,7 +164,7 @@ typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser);
+ struct user_event **newuser, int reg_flags);
static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
@@ -809,7 +810,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
* Upon success user_event has its ref count increased by 1.
*/
static int user_event_parse_cmd(struct user_event_group *group,
- char *raw_command, struct user_event **newuser)
+ char *raw_command, struct user_event **newuser,
+ int reg_flags)
{
char *name = raw_command;
char *args = strpbrk(name, " ");
@@ -823,7 +825,7 @@ static int user_event_parse_cmd(struct user_event_group *group,
if (flags)
*flags++ = '\0';
- return user_event_parse(group, name, args, flags, newuser);
+ return user_event_parse(group, name, args, flags, newuser, reg_flags);
}
static int user_field_array_size(const char *type)
@@ -1587,7 +1589,7 @@ static int user_event_create(const char *raw_command)
mutex_lock(&group->reg_mutex);
- ret = user_event_parse_cmd(group, name, &user);
+ ret = user_event_parse_cmd(group, name, &user, 0);
if (!ret)
refcount_dec(&user->refcnt);
@@ -1748,7 +1750,7 @@ static int user_event_trace_register(struct user_event *user)
*/
static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
- struct user_event **newuser)
+ struct user_event **newuser, int reg_flags)
{
int ret;
u32 key;
@@ -1819,6 +1821,8 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (ret)
goto put_user_lock;
+ user->reg_flags = reg_flags;
+
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
@@ -2117,7 +2121,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
return ret;
}
- ret = user_event_parse_cmd(info->group, name, &user);
+ ret = user_event_parse_cmd(info->group, name, &user, reg.flags);
if (ret) {
kfree(name);
--
2.25.1
There is now a flag for user_events to use when registering events to
have events continue to exist after the last reference is put. Add the new
flag, USER_EVENT_REG_PERSIST, to user_events documentation files to let
people know when to use it.
Signed-off-by: Beau Belgrave <[email protected]>
---
Documentation/trace/user_events.rst | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst
index f79987e16cf4..6736e5a32293 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -39,6 +39,14 @@ DIAG_IOCSREG.
This command takes a packed struct user_reg as an argument::
+ enum user_reg_flag {
+ /* Event will not delete upon last reference closing */
+ USER_EVENT_REG_PERSIST = 1U << 0,
+
+ /* This value or above is currently non-ABI */
+ USER_EVENT_REG_MAX = 1U << 1,
+ };
+
struct user_reg {
/* Input: Size of the user_reg structure being used */
__u32 size;
@@ -49,7 +57,7 @@ This command takes a packed struct user_reg as an argument::
/* Input: Enable size in bytes at address */
__u8 enable_size;
- /* Input: Flags for future use, set to 0 */
+ /* Input: Flags can be any of the above user_reg_flag values */
__u16 flags;
/* Input: Address to update when enabled */
@@ -73,10 +81,13 @@ The struct user_reg requires all the above inputs to be set appropriately.
This must be 4 (32-bit) or 8 (64-bit). 64-bit values are only allowed to be
used on 64-bit kernels, however, 32-bit can be used on all kernels.
-+ flags: The flags to use, if any. For the initial version this must be 0.
- Callers should first attempt to use flags and retry without flags to ensure
- support for lower versions of the kernel. If a flag is not supported -EINVAL
- is returned.
++ flags: The flags to use, if any. Callers should first attempt to use flags
+ and retry without flags to ensure support for lower versions of the kernel.
+ If a flag is not supported -EINVAL is returned.
+
+ **USER_EVENT_REG_PERSIST**
+ When the last reference is closed for the event, the event will continue
+ to exist until a delete IOCTL is issued by a user.
+ enable_addr: The address of the value to use to reflect event status. This
must be naturally aligned and write accessible within the user program.
--
2.25.1
Currently user events need to be manually deleted via the delete IOCTL
call or via the dynamic_events file. Most operators and processes wish
to have these events automatically clean up when they are no longer used by
anything, to prevent them from piling up without manual maintenance. However,
some operators may not want this, such as pre-registering events via the
dynamic_events tracefs file.
Add a persist flag to user facing header and honor it within the
register IOCTL call. Add max flag as well to ensure that only known
flags can be used now and in the future. Update user_event_put() to
attempt an auto delete of the event if it's the last reference. The
auto delete must run in a work queue to ensure proper behavior of
class->reg() invocations that don't expect the call to go away from
underneath them during the unregister. Add work_struct to user_event
struct to ensure we can do this reliably.
Link: https://lore.kernel.org/linux-trace-kernel/[email protected]/
Suggested-by: Steven Rostedt <[email protected]>
Signed-off-by: Beau Belgrave <[email protected]>
---
include/uapi/linux/user_events.h | 10 ++-
kernel/trace/trace_events_user.c | 118 +++++++++++++++++++++++++++----
2 files changed, 114 insertions(+), 14 deletions(-)
diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 2984aae4a2b4..74b909e520aa 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -17,6 +17,14 @@
/* Create dynamic location entry within a 32-bit value */
#define DYN_LOC(offset, size) ((size) << 16 | (offset))
+enum user_reg_flag {
+ /* Event will not delete upon last reference closing */
+ USER_EVENT_REG_PERSIST = 1U << 0,
+
+ /* This value or above is currently non-ABI */
+ USER_EVENT_REG_MAX = 1U << 1,
+};
+
/*
* Describes an event registration and stores the results of the registration.
* This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
@@ -33,7 +41,7 @@ struct user_reg {
/* Input: Enable size in bytes at address */
__u8 enable_size;
- /* Input: Flags for future use, set to 0 */
+ /* Input: Flags can be any of the above user_reg_flag values */
__u16 flags;
/* Input: Address to update when enabled */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 8f0fb6cb0f33..170ec2f5076c 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -85,6 +85,7 @@ struct user_event {
struct hlist_node node;
struct list_head fields;
struct list_head validators;
+ struct work_struct put_work;
refcount_t refcnt;
int min_size;
int reg_flags;
@@ -169,6 +170,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
+static int destroy_user_event(struct user_event *user);
static u32 user_event_key(char *name)
{
@@ -182,19 +184,98 @@ static struct user_event *user_event_get(struct user_event *user)
return user;
}
+static void delayed_destroy_user_event(struct work_struct *work)
+{
+ struct user_event *user = container_of(
+ work, struct user_event, put_work);
+
+ mutex_lock(&event_mutex);
+
+ if (!refcount_dec_and_test(&user->refcnt))
+ goto out;
+
+ if (destroy_user_event(user)) {
+ /*
+ * The only reason this would fail here is if we cannot
+ * update the visibility of the event. In this case the
+ * event stays in the hashtable, waiting for someone to
+ * attempt to delete it later.
+ */
+ pr_warn("user_events: Unable to delete event\n");
+ refcount_set(&user->refcnt, 1);
+ }
+out:
+ mutex_unlock(&event_mutex);
+}
+
static void user_event_put(struct user_event *user, bool locked)
{
-#ifdef CONFIG_LOCKDEP
- if (locked)
- lockdep_assert_held(&event_mutex);
- else
- lockdep_assert_not_held(&event_mutex);
-#endif
+ bool delete;
if (unlikely(!user))
return;
- refcount_dec(&user->refcnt);
+ /*
+ * When the event is not enabled for auto-delete there will always
+ * be at least 1 reference to the event. During the event creation
+ * we initially set the refcnt to 2 to achieve this. In those cases
+ * the caller must acquire event_mutex and after decrement check if
+ * the refcnt is 1, meaning this is the last reference. When auto
+ * delete is enabled, there will only be 1 ref, IE: refcnt will be
+ * only set to 1 during creation to allow the below checks to go
+ * through upon the last put. The last put must always be done with
+ * the event mutex held.
+ */
+ if (!locked) {
+ lockdep_assert_not_held(&event_mutex);
+ delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
+ } else {
+ lockdep_assert_held(&event_mutex);
+ delete = refcount_dec_and_test(&user->refcnt);
+ }
+
+ if (!delete)
+ return;
+
+ /* We now have the event_mutex in all cases */
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* We should not get here when persist flag is set */
+ pr_alert("BUG: Auto-delete engaged on persistent event\n");
+ goto out;
+ }
+
+ /*
+ * Unfortunately we have to attempt the actual destroy in a work
+ * queue. This is because not all cases handle a trace_event_call
+ * being removed within the class->reg() operation for unregister.
+ */
+ INIT_WORK(&user->put_work, delayed_destroy_user_event);
+
+ /*
+ * Since the event is still in the hashtable, we have to re-inc
+ * the ref count to 1. This count will be decremented and checked
+ * in the work queue to ensure it's still the last ref. This is
+ * needed because a user-process could register the same event in
+ * between the time of event_mutex release and the work queue
+ * running the delayed destroy. If we removed the item now from
+ * the hashtable, this would result in a timing window where a
+ * user process would fail a register because the trace_event_call
+ * register would fail in the tracing layers.
+ */
+ refcount_set(&user->refcnt, 1);
+
+ if (!schedule_work(&user->put_work)) {
+ /*
+ * If we fail we must wait for an admin to attempt delete or
+ * another register/close of the event, whichever is first.
+ */
+ pr_warn("user_events: Unable to queue delayed destroy\n");
+ }
+out:
+ /* Ensure if we didn't have event_mutex before we unlock it */
+ if (!locked)
+ mutex_unlock(&event_mutex);
}
static void user_event_group_destroy(struct user_event_group *group)
@@ -793,7 +874,12 @@ static struct user_event_enabler
static __always_inline __must_check
bool user_event_last_ref(struct user_event *user)
{
- return refcount_read(&user->refcnt) == 1;
+ int last = 0;
+
+ if (user->reg_flags & USER_EVENT_REG_PERSIST)
+ last = 1;
+
+ return refcount_read(&user->refcnt) == last;
}
static __always_inline __must_check
@@ -1609,7 +1695,8 @@ static int user_event_create(const char *raw_command)
mutex_lock(&group->reg_mutex);
- ret = user_event_parse_cmd(group, name, &user, 0);
+ /* Dyn events persist, otherwise they would cleanup immediately */
+ ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
if (!ret)
user_event_put(user, false);
@@ -1843,8 +1930,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
user->reg_flags = reg_flags;
- /* Ensure we track self ref and caller ref (2) */
- refcount_set(&user->refcnt, 2);
+ if (user->reg_flags & USER_EVENT_REG_PERSIST) {
+ /* Ensure we track self ref and caller ref (2) */
+ refcount_set(&user->refcnt, 2);
+ } else {
+ /* Ensure we track only caller ref (1) */
+ refcount_set(&user->refcnt, 1);
+ }
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
@@ -2066,8 +2158,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
- /* Ensure no flags, since we don't support any yet */
- if (kreg->flags != 0)
+ /* Ensure only valid flags */
+ if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
return -EINVAL;
/* Ensure supported size */
--
2.25.1
Various parts of the code today track user_event's refcnt field directly
via a refcount_add/dec. This makes it hard to modify the behavior of the
last reference decrement in all code paths consistently. For example, in
the future we will auto-delete events upon the last reference going
away. This last reference could happen in many places, but we want it to
be consistently handled.
Add user_event_get() and user_event_put() for the add/dec. Update all
places where direct refcounts are being used to utilize these new
functions. In each location pass if event_mutex is locked or not. This
allows us to drop events automatically in future patches clearly. Ensure
when caller states the lock is held, it really is (or is not) held.
Signed-off-by: Beau Belgrave <[email protected]>
---
kernel/trace/trace_events_user.c | 66 +++++++++++++++++++-------------
1 file changed, 40 insertions(+), 26 deletions(-)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 34aa0a5d8e2a..8f0fb6cb0f33 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -175,6 +175,28 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}
+static struct user_event *user_event_get(struct user_event *user)
+{
+ refcount_inc(&user->refcnt);
+
+ return user;
+}
+
+static void user_event_put(struct user_event *user, bool locked)
+{
+#ifdef CONFIG_LOCKDEP
+ if (locked)
+ lockdep_assert_held(&event_mutex);
+ else
+ lockdep_assert_not_held(&event_mutex);
+#endif
+
+ if (unlikely(!user))
+ return;
+
+ refcount_dec(&user->refcnt);
+}
+
static void user_event_group_destroy(struct user_event_group *group)
{
kfree(group->system_name);
@@ -258,12 +280,13 @@ static struct user_event_group
return NULL;
};
-static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler,
+ bool locked)
{
list_del_rcu(&enabler->link);
/* No longer tracking the event via the enabler */
- refcount_dec(&enabler->event->refcnt);
+ user_event_put(enabler->event, locked);
kfree(enabler);
}
@@ -325,7 +348,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
/* User asked for enabler to be removed during fault */
if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
goto out;
}
@@ -489,13 +512,12 @@ static bool user_event_enabler_dup(struct user_event_enabler *orig,
if (!enabler)
return false;
- enabler->event = orig->event;
+ enabler->event = user_event_get(orig->event);
enabler->addr = orig->addr;
/* Only dup part of value (ignore future flags, etc) */
enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
- refcount_inc(&enabler->event->refcnt);
list_add_rcu(&enabler->link, &mm->enablers);
return true;
@@ -595,7 +617,7 @@ static void user_event_mm_destroy(struct user_event_mm *mm)
struct user_event_enabler *enabler, *next;
list_for_each_entry_safe(enabler, next, &mm->enablers, link)
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, false);
mmdrop(mm->mm);
kfree(mm);
@@ -748,7 +770,7 @@ static struct user_event_enabler
* exit or run exec(), which includes forks and clones.
*/
if (!*write_result) {
- refcount_inc(&enabler->event->refcnt);
+ user_event_get(user);
list_add_rcu(&enabler->link, &user_mm->enablers);
}
@@ -1336,10 +1358,8 @@ static struct user_event *find_user_event(struct user_event_group *group,
*outkey = key;
hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name)) {
- refcount_inc(&user->refcnt);
- return user;
- }
+ if (!strcmp(EVENT_NAME(user), name))
+ return user_event_get(user);
return NULL;
}
@@ -1553,12 +1573,12 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
- refcount_inc(&user->refcnt);
+ user_event_get(user);
update_enable_bit_for(user);
return 0;
dec:
update_enable_bit_for(user);
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
return 0;
}
@@ -1592,7 +1612,7 @@ static int user_event_create(const char *raw_command)
ret = user_event_parse_cmd(group, name, &user, 0);
if (!ret)
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
mutex_unlock(&group->reg_mutex);
@@ -1856,7 +1876,7 @@ static int delete_user_event(struct user_event_group *group, char *name)
if (!user)
return -ENOENT;
- refcount_dec(&user->refcnt);
+ user_event_put(user, true);
if (!user_event_last_ref(user))
return -EBUSY;
@@ -2015,9 +2035,7 @@ static int user_events_ref_add(struct user_event_file_info *info,
for (i = 0; i < count; ++i)
new_refs->events[i] = refs->events[i];
- new_refs->events[i] = user;
-
- refcount_inc(&user->refcnt);
+ new_refs->events[i] = user_event_get(user);
rcu_assign_pointer(info->refs, new_refs);
@@ -2131,7 +2149,7 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
ret = user_events_ref_add(info, user);
/* No longer need parse ref, ref_add either worked or not */
- refcount_dec(&user->refcnt);
+ user_event_put(user, false);
/* Positive number is index and valid */
if (ret < 0)
@@ -2280,7 +2298,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
- user_event_enabler_destroy(enabler);
+ user_event_enabler_destroy(enabler, true);
/* Removed at least one */
ret = 0;
@@ -2337,7 +2355,6 @@ static int user_events_release(struct inode *node, struct file *file)
struct user_event_file_info *info = file->private_data;
struct user_event_group *group;
struct user_event_refs *refs;
- struct user_event *user;
int i;
if (!info)
@@ -2361,12 +2378,9 @@ static int user_events_release(struct inode *node, struct file *file)
* The underlying user_events are ref counted, and cannot be freed.
* After this decrement, the user_events may be freed elsewhere.
*/
- for (i = 0; i < refs->count; ++i) {
- user = refs->events[i];
+ for (i = 0; i < refs->count; ++i)
+ user_event_put(refs->events[i], false);
- if (user)
- refcount_dec(&user->refcnt);
- }
out:
file->private_data = NULL;
--
2.25.1
A new flag for persisting user_events upon the last reference exists
now. We must ensure this flag works correctly in the common cases.
Update abi self test to ensure when this flag is used the user_event
goes away at the appropriate time. Ensure the last fd, enabler, and
trace_event_call ref paths correctly delete the event for non-persist
events.
Signed-off-by: Beau Belgrave <[email protected]>
---
.../testing/selftests/user_events/abi_test.c | 144 +++++++++++++++++-
.../selftests/user_events/ftrace_test.c | 1 +
2 files changed, 137 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c
index 5125c42efe65..55aaec21fd8e 100644
--- a/tools/testing/selftests/user_events/abi_test.c
+++ b/tools/testing/selftests/user_events/abi_test.c
@@ -22,10 +22,61 @@
const char *data_file = "/sys/kernel/tracing/user_events_data";
const char *enable_file = "/sys/kernel/tracing/events/user_events/__abi_event/enable";
+const char *temp_enable_file = "/sys/kernel/tracing/events/user_events/__abi_temp_event/enable";
-static int change_event(bool enable)
+static bool __exists(int grace_ms, const char *path)
{
- int fd = open(enable_file, O_RDWR);
+ int fd;
+
+ usleep(grace_ms * 1000);
+
+ fd = open(path, O_RDONLY);
+
+ if (fd == -1)
+ return false;
+
+ close(fd);
+
+ return true;
+}
+
+static bool temp_exists(int grace_ms)
+{
+ return __exists(grace_ms, temp_enable_file);
+}
+
+static bool exists(int grace_ms)
+{
+ return __exists(grace_ms, enable_file);
+}
+
+static int __clear(const char *name)
+{
+ int fd = open(data_file, O_RDWR);
+ int ret = 0;
+
+ if (ioctl(fd, DIAG_IOCSDEL, name) == -1)
+ if (errno != ENOENT)
+ ret = -1;
+
+ close(fd);
+
+ return ret;
+}
+
+static int clear_temp(void)
+{
+ return __clear("__abi_temp_event");
+}
+
+static int clear(void)
+{
+ return __clear("__abi_event");
+}
+
+static int __change_event(const char *path, bool enable)
+{
+ int fd = open(path, O_RDWR);
int ret;
if (fd < 0)
@@ -46,22 +97,48 @@ static int change_event(bool enable)
return ret;
}
-static int reg_enable(long *enable, int size, int bit)
+static int change_temp_event(bool enable)
+{
+ return __change_event(temp_enable_file, enable);
+}
+
+static int change_event(bool enable)
+{
+ return __change_event(enable_file, enable);
+}
+
+static int __reg_enable(int *fd, const char *name, long *enable, int size,
+ int bit, int flags)
{
struct user_reg reg = {0};
- int fd = open(data_file, O_RDWR);
- int ret;
- if (fd < 0)
+ *fd = open(data_file, O_RDWR);
+
+ if (*fd < 0)
return -1;
reg.size = sizeof(reg);
- reg.name_args = (__u64)"__abi_event";
+ reg.name_args = (__u64)name;
reg.enable_bit = bit;
reg.enable_addr = (__u64)enable;
reg.enable_size = size;
+ reg.flags = flags;
+
+ return ioctl(*fd, DIAG_IOCSREG, ®);
+}
- ret = ioctl(fd, DIAG_IOCSREG, ®);
+static int reg_enable_temp(int *fd, long *enable, int size, int bit)
+{
+ return __reg_enable(fd, "__abi_temp_event", enable, size, bit, 0);
+}
+
+static int reg_enable(long *enable, int size, int bit)
+{
+ int ret;
+ int fd;
+
+ ret = __reg_enable(&fd, "__abi_event", enable, size, bit,
+ USER_EVENT_REG_PERSIST);
close(fd);
@@ -98,6 +175,8 @@ FIXTURE_SETUP(user) {
}
FIXTURE_TEARDOWN(user) {
+ clear();
+ clear_temp();
}
TEST_F(user, enablement) {
@@ -223,6 +302,55 @@ TEST_F(user, clones) {
ASSERT_EQ(0, change_event(false));
}
+TEST_F(user, flags) {
+ int grace = 100;
+ int fd;
+
+ /* FLAG: None */
+ /* Removal path 1, close on last fd ref */
+ ASSERT_EQ(0, clear_temp());
+ ASSERT_EQ(0, reg_enable_temp(&fd, &self->check, sizeof(int), 0));
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ close(fd);
+ ASSERT_EQ(false, temp_exists(grace));
+
+ /* Removal path 2, close on last enabler */
+ ASSERT_EQ(0, clear_temp());
+ ASSERT_EQ(0, reg_enable_temp(&fd, &self->check, sizeof(int), 0));
+ close(fd);
+ ASSERT_EQ(true, temp_exists(grace));
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ ASSERT_EQ(false, temp_exists(grace));
+
+ /* Removal path 3, close on last trace_event ref */
+ ASSERT_EQ(0, clear_temp());
+ ASSERT_EQ(0, reg_enable_temp(&fd, &self->check, sizeof(int), 0));
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ ASSERT_EQ(0, change_temp_event(true));
+ close(fd);
+ ASSERT_EQ(true, temp_exists(grace));
+ ASSERT_EQ(0, change_temp_event(false));
+ ASSERT_EQ(false, temp_exists(grace));
+
+ /* FLAG: USER_EVENT_REG_PERSIST */
+ ASSERT_EQ(0, clear());
+ ASSERT_EQ(0, reg_enable(&self->check, sizeof(int), 0));
+ ASSERT_EQ(0, reg_disable(&self->check, 0));
+ ASSERT_EQ(true, exists(grace));
+ ASSERT_EQ(0, clear());
+ ASSERT_EQ(false, exists(grace));
+
+ /* FLAG: Non-ABI */
+ /* Unknown flags should fail with EINVAL */
+ ASSERT_EQ(-1, __reg_enable(&fd, "__abi_invalid_event", &self->check,
+ sizeof(int), 0, USER_EVENT_REG_MAX));
+ ASSERT_EQ(EINVAL, errno);
+
+ ASSERT_EQ(-1, __reg_enable(&fd, "__abi_invalid_event", &self->check,
+ sizeof(int), 0, USER_EVENT_REG_MAX + 1));
+ ASSERT_EQ(EINVAL, errno);
+}
+
int main(int argc, char **argv)
{
return test_harness_run(argc, argv);
diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c
index 7c99cef94a65..e5e966d77918 100644
--- a/tools/testing/selftests/user_events/ftrace_test.c
+++ b/tools/testing/selftests/user_events/ftrace_test.c
@@ -210,6 +210,7 @@ TEST_F(user, register_events) {
reg.enable_bit = 31;
reg.enable_addr = (__u64)&self->check;
reg.enable_size = sizeof(self->check);
+ reg.flags = USER_EVENT_REG_PERSIST;
unreg.size = sizeof(unreg);
unreg.disable_bit = 31;
--
2.25.1
On Mon, Jun 5, 2023 at 4:39 PM Beau Belgrave <[email protected]> wrote:
> + /*
> + * When the event is not enabled for auto-delete there will always
> + * be at least 1 reference to the event. During the event creation
> + * we initially set the refcnt to 2 to achieve this. In those cases
> + * the caller must acquire event_mutex and after decrement check if
> + * the refcnt is 1, meaning this is the last reference. When auto
> + * delete is enabled, there will only be 1 ref, IE: refcnt will be
> + * only set to 1 during creation to allow the below checks to go
> + * through upon the last put. The last put must always be done with
> + * the event mutex held.
> + */
> + if (!locked) {
> + lockdep_assert_not_held(&event_mutex);
> + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
> + } else {
> + lockdep_assert_held(&event_mutex);
> + delete = refcount_dec_and_test(&user->refcnt);
> + }
> +
> + if (!delete)
> + return;
> +
> + /* We now have the event_mutex in all cases */
> +
> + if (user->reg_flags & USER_EVENT_REG_PERSIST) {
> + /* We should not get here when persist flag is set */
> + pr_alert("BUG: Auto-delete engaged on persistent event\n");
> + goto out;
> + }
> +
> + /*
> + * Unfortunately we have to attempt the actual destroy in a work
> + * queue. This is because not all cases handle a trace_event_call
> + * being removed within the class->reg() operation for unregister.
> + */
> + INIT_WORK(&user->put_work, delayed_destroy_user_event);
> +
> + /*
> + * Since the event is still in the hashtable, we have to re-inc
> + * the ref count to 1. This count will be decremented and checked
> + * in the work queue to ensure it's still the last ref. This is
> + * needed because a user-process could register the same event in
> + * between the time of event_mutex release and the work queue
> + * running the delayed destroy. If we removed the item now from
> + * the hashtable, this would result in a timing window where a
> + * user process would fail a register because the trace_event_call
> + * register would fail in the tracing layers.
> + */
> + refcount_set(&user->refcnt, 1);
The refcnt-ing scheme is quite unorthodox.
Atomically decrementing it to zero and then immediately set it back to 1?
Smells racy.
Another process can go through the same code and do another dec and set to 1
and we'll have two work queued?
Will mutex_lock help somehow? If yes, then why atomic refcnt?
On Tue, Jun 06, 2023 at 06:26:56PM -0700, Alexei Starovoitov wrote:
> On Mon, Jun 5, 2023 at 4:39 PM Beau Belgrave <[email protected]> wrote:
> > + /*
> > + * When the event is not enabled for auto-delete there will always
> > + * be at least 1 reference to the event. During the event creation
> > + * we initially set the refcnt to 2 to achieve this. In those cases
> > + * the caller must acquire event_mutex and after decrement check if
> > + * the refcnt is 1, meaning this is the last reference. When auto
> > + * delete is enabled, there will only be 1 ref, IE: refcnt will be
> > + * only set to 1 during creation to allow the below checks to go
> > + * through upon the last put. The last put must always be done with
> > + * the event mutex held.
> > + */
> > + if (!locked) {
> > + lockdep_assert_not_held(&event_mutex);
> > + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex);
> > + } else {
> > + lockdep_assert_held(&event_mutex);
> > + delete = refcount_dec_and_test(&user->refcnt);
> > + }
> > +
> > + if (!delete)
> > + return;
> > +
> > + /* We now have the event_mutex in all cases */
> > +
> > + if (user->reg_flags & USER_EVENT_REG_PERSIST) {
> > + /* We should not get here when persist flag is set */
> > + pr_alert("BUG: Auto-delete engaged on persistent event\n");
> > + goto out;
> > + }
> > +
> > + /*
> > + * Unfortunately we have to attempt the actual destroy in a work
> > + * queue. This is because not all cases handle a trace_event_call
> > + * being removed within the class->reg() operation for unregister.
> > + */
> > + INIT_WORK(&user->put_work, delayed_destroy_user_event);
> > +
> > + /*
> > + * Since the event is still in the hashtable, we have to re-inc
> > + * the ref count to 1. This count will be decremented and checked
> > + * in the work queue to ensure it's still the last ref. This is
> > + * needed because a user-process could register the same event in
> > + * between the time of event_mutex release and the work queue
> > + * running the delayed destroy. If we removed the item now from
> > + * the hashtable, this would result in a timing window where a
> > + * user process would fail a register because the trace_event_call
> > + * register would fail in the tracing layers.
> > + */
> > + refcount_set(&user->refcnt, 1);
>
> The refcnt-ing scheme is quite unorthodox.
Yes, it's unfortunately because we have to keep the event in the hashtable.
Typically we'd just remove the event from the hashtable, ref_dec and
then upon final ref_dec free it. The problem with that is the event in
the hashtable is an actual trace_event exposed via tracefs/perf. It
prevents us from removing it at this time as the comment tries to
explain.
> Atomically decrementing it to zero and then immediately set it back to 1?
> Smells racy.
Thanks for pointing this out, I likely need another comment in the code
explaining why this is ok.
It might smell that way :) But the only way once it's zero to get
another reference is by acquiring event_mutex and then calling
find_user_event(). This is why it's important that at this phase we hold
the event_mutex in all cases.
> Another process can go through the same code and do another dec and set to 1
> and we'll have two work queued?
Once we set it to 1, we cannot get into this code path again until the
work that was queued is finished. The scheduled work acquires
event_mutex and ensures it's still the last reference before doing any
actual deletion or a refcount_dec that would allow for another work
queue via this path.
> Will mutex_lock help somehow? If yes, then why atomic refcnt?
I hopefully fully explained the mutex_lock above, if it's confusing
still let me know. The reason for the atomic refcnt though is because you
initially get a reference to a user_event via find_user_event() under
the lock, but then each FD (and each enabler) has a reference to the
underlying user_event (and thus the trace_event). Fork() and enablement
registers are done outside of this lock for performance reasons. When
the task exits, we also close down the enablers for that mm/task, and so
the lock is not there either (and why having the less used
refcount_dec_and_mutex_lock being required on put when it's not
acquired).
Thanks,
-Beau
On Mon, 5 Jun 2023 16:38:58 -0700
Beau Belgrave <[email protected]> wrote:
> Currently user events need to be manually deleted via the delete IOCTL
> call or via the dynamic_events file. Most operators and processes wish
> to have these events auto cleanup when they are no longer used by
> anything to prevent them piling up without manual maintenance. However,
> some operators may not want this, such as pre-registering events via the
> dynamic_events tracefs file.
>
> Add a persist flag to user facing header and honor it within the
> register IOCTL call. Add max flag as well to ensure that only known
> flags can be used now and in the future. Update user_event_put() to
> attempt an auto delete of the event if it's the last reference. The
> auto delete must run in a work queue to ensure proper behavior of
> class->reg() invocations that don't expect the call to go away from
> underneath them during the unregister. Add work_struct to user_event
> struct to ensure we can do this reliably.
>
> Link: https://lore.kernel.org/linux-trace-kernel/[email protected]/
Since there still seems to be some controversy over the persistent events,
could we hold off on implementing them until next merge window? That is, I
would like to only have the fd owned events for this release, which would
give us time to hash out any of the issues with persistent events.
If they are not in, then they are not an API, but once they are in, then we
are stuck with them. I believe everyone is fine with the fd owned events,
right?
>
> Suggested-by: Steven Rostedt <[email protected]>
> Signed-off-by: Beau Belgrave <[email protected]>
> ---
> include/uapi/linux/user_events.h | 10 ++-
> kernel/trace/trace_events_user.c | 118 +++++++++++++++++++++++++++----
> 2 files changed, 114 insertions(+), 14 deletions(-)
>
>
That is we can keep all the code of this patch, but:
> static __always_inline __must_check
> @@ -1609,7 +1695,8 @@ static int user_event_create(const char *raw_command)
>
> mutex_lock(&group->reg_mutex);
>
> - ret = user_event_parse_cmd(group, name, &user, 0);
> + /* Dyn events persist, otherwise they would cleanup immediately */
> + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
>
> if (!ret)
> user_event_put(user, false);
> @@ -1843,8 +1930,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
>
Add here:
if (reg_flags) {
/* Holding off implementing PERSIST events */
ret = -EINVAL;
goto put_user_lock;
}
Which also reminds me. We should return EINVAL if any flags that we do not
know about are set. Otherwise when we do implement new flags, the user will
not know if they are taking effect or not.
-- Steve
> user->reg_flags = reg_flags;
>
> - /* Ensure we track self ref and caller ref (2) */
> - refcount_set(&user->refcnt, 2);
> + if (user->reg_flags & USER_EVENT_REG_PERSIST) {
> + /* Ensure we track self ref and caller ref (2) */
> + refcount_set(&user->refcnt, 2);
> + } else {
> + /* Ensure we track only caller ref (1) */
> + refcount_set(&user->refcnt, 1);
> + }
>
> dyn_event_init(&user->devent, &user_event_dops);
> dyn_event_add(&user->devent, &user->call);
> @@ -2066,8 +2158,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> if (ret)
> return ret;
>
> - /* Ensure no flags, since we don't support any yet */
> - if (kreg->flags != 0)
> + /* Ensure only valid flags */
> + if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
> return -EINVAL;
>
> /* Ensure supported size */
On Thu, Jun 08, 2023 at 04:25:57PM -0400, Steven Rostedt wrote:
> On Mon, 5 Jun 2023 16:38:58 -0700
> Beau Belgrave <[email protected]> wrote:
>
> > Currently user events need to be manually deleted via the delete IOCTL
> > call or via the dynamic_events file. Most operators and processes wish
> > to have these events auto cleanup when they are no longer used by
> > anything to prevent them piling up without manual maintenance. However,
> > some operators may not want this, such as pre-registering events via the
> > dynamic_events tracefs file.
> >
> > Add a persist flag to user facing header and honor it within the
> > register IOCTL call. Add max flag as well to ensure that only known
> > flags can be used now and in the future. Update user_event_put() to
> > attempt an auto delete of the event if it's the last reference. The
> > auto delete must run in a work queue to ensure proper behavior of
> > class->reg() invocations that don't expect the call to go away from
> > underneath them during the unregister. Add work_struct to user_event
> > struct to ensure we can do this reliably.
> >
> > Link: https://lore.kernel.org/linux-trace-kernel/[email protected]/
>
> Since there still seems to be some controversy over the persistent events,
> could we hold off on implementing them until next merge window? That is, I
> would like to only have the fd owned events for this release, which would
> give us time to hash out any of the issues with persistent events.
>
> If they are not in, then they are not an API, but once they are in, then we
> are stuck with them. I believe everyone is fine with the fd owned events,
> right?
>
I am fine with this approach, however, FD owned events only means that
anyone using the /sys/kernel/tracing/dynamic_events will have the event
deleted immediately.
Should we flat out issue an error instead of having it work, but then
removed immediately?
NOTE:
I'm waiting for the other user_event patches to land in the tracing
for-next since there will be conflicts and I want to make sure I get
good coverage with catching all the put/get refs.
> >
> > Suggested-by: Steven Rostedt <[email protected]>
> > Signed-off-by: Beau Belgrave <[email protected]>
> > ---
> > include/uapi/linux/user_events.h | 10 ++-
> > kernel/trace/trace_events_user.c | 118 +++++++++++++++++++++++++++----
> > 2 files changed, 114 insertions(+), 14 deletions(-)
> >
> >
>
> That is we can keep all the code of this patch, but:
>
> > static __always_inline __must_check
> > @@ -1609,7 +1695,8 @@ static int user_event_create(const char *raw_command)
> >
> > mutex_lock(&group->reg_mutex);
> >
> > - ret = user_event_parse_cmd(group, name, &user, 0);
> > + /* Dyn events persist, otherwise they would cleanup immediately */
> > + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST);
> >
> > if (!ret)
> > user_event_put(user, false);
> > @@ -1843,8 +1930,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
> >
>
> Add here:
>
> if (reg_flags) {
> /* Holding off implementing PERSIST events */
> ret = -EINVAL;
> goto put_user_lock;
> }
>
> Which also reminds me. We should return EINVAL if any flags that we do not
> know about are set. Otherwise when we do implement new flags, the user will
> not know if they are taking effect or not.
>
We do that in user_reg_get, but point taken, some flag could come
through dynamic_events interface and then we'd miss it.
Thanks,
-Beau
> -- Steve
>
>
> > user->reg_flags = reg_flags;
> >
> > - /* Ensure we track self ref and caller ref (2) */
> > - refcount_set(&user->refcnt, 2);
> > + if (user->reg_flags & USER_EVENT_REG_PERSIST) {
> > + /* Ensure we track self ref and caller ref (2) */
> > + refcount_set(&user->refcnt, 2);
> > + } else {
> > + /* Ensure we track only caller ref (1) */
> > + refcount_set(&user->refcnt, 1);
> > + }
> >
> > dyn_event_init(&user->devent, &user_event_dops);
> > dyn_event_add(&user->devent, &user->call);
> > @@ -2066,8 +2158,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> > if (ret)
> > return ret;
> >
> > - /* Ensure no flags, since we don't support any yet */
> > - if (kreg->flags != 0)
> > + /* Ensure only valid flags */
> > + if (kreg->flags & ~(USER_EVENT_REG_MAX-1))
> > return -EINVAL;
> >
> > /* Ensure supported size */
On Mon, 5 Jun 2023 16:38:58 -0700
Beau Belgrave <[email protected]> wrote:
Continuing on what Alexei was saying ...
> + /*
> + * Unfortunately we have to attempt the actual destroy in a work
> + * queue. This is because not all cases handle a trace_event_call
> + * being removed within the class->reg() operation for unregister.
> + */
> + INIT_WORK(&user->put_work, delayed_destroy_user_event);
We initialize the work here.
> +
> + /*
> + * Since the event is still in the hashtable, we have to re-inc
> + * the ref count to 1. This count will be decremented and checked
> + * in the work queue to ensure it's still the last ref. This is
> + * needed because a user-process could register the same event in
> + * between the time of event_mutex release and the work queue
> + * running the delayed destroy. If we removed the item now from
> + * the hashtable, this would result in a timing window where a
> + * user process would fail a register because the trace_event_call
> + * register would fail in the tracing layers.
> + */
> + refcount_set(&user->refcnt, 1);
> +
> + if (!schedule_work(&user->put_work)) {
From what I understand, schedule_work() can only fail if the work is
already queued. That should never happen if we just initialized it.
That means we need a WARN_ON_ONCE() here. Because it's a major bug if that
does return false.
-- Steve
> + /*
> + * If we fail we must wait for an admin to attempt delete or
> + * another register/close of the event, whichever is first.
> + */
> + pr_warn("user_events: Unable to queue delayed destroy\n");
> + }