This corresponds to part of the functionality of the NT syscall
NtWaitForMultipleObjects(). Specifically, it implements the behaviour where
the third argument (wait_any) is TRUE, and it does not handle alertable waits.
Wait-all and alertable waits have been split out into separate patches to ease
review.
NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike
poll(), it "consumes" objects when they are signaled. For semaphores, this means
decrementing the internal counter by one. At most one object can be consumed by
this function.
Up to 64 objects can be waited on at once. As soon as one is signaled, the
signaled object with the lowest index is consumed, and that index is returned
via the "index" field.
A timeout is supported. The timeout is passed as a u64 nanosecond value, which
represents absolute time measured against either the MONOTONIC or REALTIME clock
(controlled by the flags argument). If U64_MAX is passed, the ioctl waits
indefinitely.
This ioctl validates that all objects belong to the relevant device. This is not
necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be
necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch.
Two u32s of padding are left in the ntsync_wait_args structure; one will be used
by a patch later in the series (which is split out to ease review).
Signed-off-by: Elizabeth Figura <[email protected]>
---
drivers/misc/ntsync.c | 250 ++++++++++++++++++++++++++++++++++++
include/uapi/linux/ntsync.h | 16 +++
2 files changed, 266 insertions(+)
diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
index 3c2f743c58b0..c6f84a5fc8c0 100644
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -6,11 +6,16 @@
*/
#include <linux/anon_inodes.h>
+#include <linux/atomic.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/overflow.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <uapi/linux/ntsync.h>
@@ -30,6 +35,8 @@ enum ntsync_type {
*
* Both rely on struct file for reference counting. Individual
* ntsync_obj objects take a reference to the device when created.
+ * Wait operations take a reference to each object being waited on for
+ * the duration of the wait.
*/
struct ntsync_obj {
@@ -47,12 +54,56 @@ struct ntsync_obj {
__u32 max;
} sem;
} u;
+
+ struct list_head any_waiters;
+};
+
+struct ntsync_q_entry {
+ struct list_head node;
+ struct ntsync_q *q;
+ struct ntsync_obj *obj;
+ __u32 index;
+};
+
+struct ntsync_q {
+ struct task_struct *task;
+ __u32 owner;
+
+ /*
+ * Protected via atomic_try_cmpxchg(). Only the thread that wins the
+ * compare-and-swap may actually change object states and wake this
+ * task.
+ */
+ atomic_t signaled;
+
+ __u32 count;
+ struct ntsync_q_entry entries[];
};
struct ntsync_device {
struct file *file;
};
+static void try_wake_any_sem(struct ntsync_obj *sem)
+{
+ struct ntsync_q_entry *entry;
+
+ lockdep_assert_held(&sem->lock);
+
+ list_for_each_entry(entry, &sem->any_waiters, node) {
+ struct ntsync_q *q = entry->q;
+ int signaled = -1;
+
+ if (!sem->u.sem.count)
+ break;
+
+ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+ sem->u.sem.count--;
+ wake_up_process(q->task);
+ }
+ }
+}
+
/*
* Actually change the semaphore state, returning -EOVERFLOW if it is made
* invalid.
@@ -88,6 +139,8 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
prev_count = sem->u.sem.count;
ret = post_sem_state(sem, args);
+ if (!ret)
+ try_wake_any_sem(sem);
spin_unlock(&sem->lock);
@@ -141,6 +194,7 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
obj->dev = dev;
get_file(dev->file);
spin_lock_init(&obj->lock);
+ INIT_LIST_HEAD(&obj->any_waiters);
return obj;
}
@@ -191,6 +245,200 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
return put_user(fd, &user_args->sem);
}
+static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
+{
+ struct file *file = fget(fd);
+ struct ntsync_obj *obj;
+
+ if (!file)
+ return NULL;
+
+ if (file->f_op != &ntsync_obj_fops) {
+ fput(file);
+ return NULL;
+ }
+
+ obj = file->private_data;
+ if (obj->dev != dev) {
+ fput(file);
+ return NULL;
+ }
+
+ return obj;
+}
+
+static void put_obj(struct ntsync_obj *obj)
+{
+ fput(obj->file);
+}
+
+static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args)
+{
+ ktime_t timeout = ns_to_ktime(args->timeout);
+ clockid_t clock = CLOCK_MONOTONIC;
+ ktime_t *timeout_ptr;
+ int ret = 0;
+
+ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout);
+
+ if (args->flags & NTSYNC_WAIT_REALTIME)
+ clock = CLOCK_REALTIME;
+
+ do {
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&q->signaled) != -1) {
+ ret = 0;
+ break;
+ }
+ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock);
+ } while (ret < 0);
+ __set_current_state(TASK_RUNNING);
+
+ return ret;
+}
+
+/*
+ * Allocate and initialize the ntsync_q structure, but do not queue us yet.
+ */
+static int setup_wait(struct ntsync_device *dev,
+ const struct ntsync_wait_args *args,
+ struct ntsync_q **ret_q)
+{
+ const __u32 count = args->count;
+ int fds[NTSYNC_MAX_WAIT_COUNT];
+ struct ntsync_q *q;
+ __u32 i, j;
+
+ if (!args->owner)
+ return -EINVAL;
+
+ if (args->pad || args->pad2 || (args->flags & ~NTSYNC_WAIT_REALTIME))
+ return -EINVAL;
+
+ if (args->count > NTSYNC_MAX_WAIT_COUNT)
+ return -EINVAL;
+
+ if (copy_from_user(fds, u64_to_user_ptr(args->objs),
+ array_size(count, sizeof(*fds))))
+ return -EFAULT;
+
+ q = kmalloc(struct_size(q, entries, count), GFP_KERNEL);
+ if (!q)
+ return -ENOMEM;
+ q->task = current;
+ q->owner = args->owner;
+ atomic_set(&q->signaled, -1);
+ q->count = count;
+
+ for (i = 0; i < count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = get_obj(dev, fds[i]);
+
+ if (!obj)
+ goto err;
+
+ entry->obj = obj;
+ entry->q = q;
+ entry->index = i;
+ }
+
+ *ret_q = q;
+ return 0;
+
+err:
+ for (j = 0; j < i; j++)
+ put_obj(q->entries[j].obj);
+ kfree(q);
+ return -EINVAL;
+}
+
+static void try_wake_any_obj(struct ntsync_obj *obj)
+{
+ switch (obj->type) {
+ case NTSYNC_TYPE_SEM:
+ try_wake_any_sem(obj);
+ break;
+ }
+}
+
+static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp)
+{
+ struct ntsync_wait_args args;
+ struct ntsync_q *q;
+ int signaled;
+ __u32 i;
+ int ret;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ ret = setup_wait(dev, &args, &q);
+ if (ret < 0)
+ return ret;
+
+ /* queue ourselves */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_add_tail(&entry->node, &obj->any_waiters);
+ spin_unlock(&obj->lock);
+ }
+
+ /* check if we are already signaled */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_obj *obj = q->entries[i].obj;
+
+ if (atomic_read(&q->signaled) != -1)
+ break;
+
+ spin_lock(&obj->lock);
+ try_wake_any_obj(obj);
+ spin_unlock(&obj->lock);
+ }
+
+ /* sleep */
+
+ ret = ntsync_schedule(q, &args);
+
+ /* and finally, unqueue */
+
+ for (i = 0; i < args.count; i++) {
+ struct ntsync_q_entry *entry = &q->entries[i];
+ struct ntsync_obj *obj = entry->obj;
+
+ spin_lock(&obj->lock);
+ list_del(&entry->node);
+ spin_unlock(&obj->lock);
+
+ put_obj(obj);
+ }
+
+ signaled = atomic_read(&q->signaled);
+ if (signaled != -1) {
+ struct ntsync_wait_args __user *user_args = argp;
+
+ /* even if we caught a signal, we need to communicate success */
+ ret = 0;
+
+ if (put_user(signaled, &user_args->index))
+ ret = -EFAULT;
+ } else if (!ret) {
+ ret = -ETIMEDOUT;
+ }
+
+ kfree(q);
+ return ret;
+}
+
static int ntsync_char_open(struct inode *inode, struct file *file)
{
struct ntsync_device *dev;
@@ -222,6 +470,8 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NTSYNC_IOC_CREATE_SEM:
return ntsync_create_sem(dev, argp);
+ case NTSYNC_IOC_WAIT_ANY:
+ return ntsync_wait_any(dev, argp);
default:
return -ENOIOCTLCMD;
}
diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h
index dcfa38fdc93c..60ad414b5552 100644
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -16,7 +16,23 @@ struct ntsync_sem_args {
__u32 max;
};
+#define NTSYNC_WAIT_REALTIME 0x1
+
+struct ntsync_wait_args {
+ __u64 timeout;
+ __u64 objs;
+ __u32 count;
+ __u32 owner;
+ __u32 index;
+ __u32 flags;
+ __u32 pad;
+ __u32 pad2;
+};
+
+#define NTSYNC_MAX_WAIT_COUNT 64
+
#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args)
+#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args)
#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32)
--
2.43.0
On Thu, Mar 28, 2024 at 07:05:55PM -0500, Elizabeth Figura wrote:
> This corresponds to part of the functionality of the NT syscall
> NtWaitForMultipleObjects(). Specifically, it implements the behaviour where
> the third argument (wait_any) is TRUE, and it does not handle alertable waits.
> Those features have been split out into separate patches to ease review.
>
> NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike
> poll(), it "consumes" objects when they are signaled. For semaphores, this means
> decreasing one from the internal counter. At most one object can be consumed by
> this function.
>
> Up to 64 objects can be waited on at once. As soon as one is signaled, the
> object with the lowest index is consumed, and that index is returned via the
> "index" field.
So it's kind of like our internal locks already? Or futex?
>
> A timeout is supported. The timeout is passed as a u64 nanosecond value, which
> represents absolute time measured against either the MONOTONIC or REALTIME clock
> (controlled by the flags argument). If U64_MAX is passed, the ioctl waits
> indefinitely.
>
> This ioctl validates that all objects belong to the relevant device. This is not
> necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be
> necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch.
>
> Two u32s of padding are left in the ntsync_wait_args structure; one will be used
> by a patch later in the series (which is split out to ease review).
>
> Signed-off-by: Elizabeth Figura <[email protected]>
> ---
> drivers/misc/ntsync.c | 250 ++++++++++++++++++++++++++++++++++++
> include/uapi/linux/ntsync.h | 16 +++
> 2 files changed, 266 insertions(+)
>
> diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
> index 3c2f743c58b0..c6f84a5fc8c0 100644
> --- a/drivers/misc/ntsync.c
> +++ b/drivers/misc/ntsync.c
> @@ -6,11 +6,16 @@
> */
>
> #include <linux/anon_inodes.h>
> +#include <linux/atomic.h>
> #include <linux/file.h>
> #include <linux/fs.h>
> +#include <linux/hrtimer.h>
> +#include <linux/ktime.h>
> #include <linux/miscdevice.h>
> #include <linux/module.h>
> #include <linux/overflow.h>
> +#include <linux/sched.h>
> +#include <linux/sched/signal.h>
> #include <linux/slab.h>
> #include <linux/spinlock.h>
> #include <uapi/linux/ntsync.h>
> @@ -30,6 +35,8 @@ enum ntsync_type {
> *
> * Both rely on struct file for reference counting. Individual
> * ntsync_obj objects take a reference to the device when created.
> + * Wait operations take a reference to each object being waited on for
> + * the duration of the wait.
> */
>
> struct ntsync_obj {
> @@ -47,12 +54,56 @@ struct ntsync_obj {
> __u32 max;
> } sem;
> } u;
> +
> + struct list_head any_waiters;
> +};
> +
> +struct ntsync_q_entry {
> + struct list_head node;
> + struct ntsync_q *q;
> + struct ntsync_obj *obj;
> + __u32 index;
> +};
> +
> +struct ntsync_q {
> + struct task_struct *task;
> + __u32 owner;
> +
> + /*
> + * Protected via atomic_try_cmpxchg(). Only the thread that wins the
> + * compare-and-swap may actually change object states and wake this
> + * task.
> + */
> + atomic_t signaled;
This feels odd; why are you duplicating normal lock functionality
here?
> +
> + __u32 count;
> + struct ntsync_q_entry entries[];
> };
>
> struct ntsync_device {
> struct file *file;
> };
>
> +static void try_wake_any_sem(struct ntsync_obj *sem)
> +{
> + struct ntsync_q_entry *entry;
> +
> + lockdep_assert_held(&sem->lock);
> +
> + list_for_each_entry(entry, &sem->any_waiters, node) {
> + struct ntsync_q *q = entry->q;
> + int signaled = -1;
> +
> + if (!sem->u.sem.count)
> + break;
> +
> + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
> + sem->u.sem.count--;
> + wake_up_process(q->task);
> + }
You are waking up _all_ "locks" with the atomic_try_cmpxchg() call,
right? Not just the "first".
Or am I confused?
> + }
> +}
> +
> /*
> * Actually change the semaphore state, returning -EOVERFLOW if it is made
> * invalid.
> @@ -88,6 +139,8 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
>
> prev_count = sem->u.sem.count;
> ret = post_sem_state(sem, args);
> + if (!ret)
> + try_wake_any_sem(sem);
>
> spin_unlock(&sem->lock);
>
> @@ -141,6 +194,7 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
> obj->dev = dev;
> get_file(dev->file);
> spin_lock_init(&obj->lock);
> + INIT_LIST_HEAD(&obj->any_waiters);
>
> return obj;
> }
> @@ -191,6 +245,200 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
> return put_user(fd, &user_args->sem);
> }
>
> +static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
> +{
> + struct file *file = fget(fd);
> + struct ntsync_obj *obj;
> +
> + if (!file)
> + return NULL;
> +
> + if (file->f_op != &ntsync_obj_fops) {
> + fput(file);
> + return NULL;
> + }
> +
> + obj = file->private_data;
> + if (obj->dev != dev) {
> + fput(file);
> + return NULL;
> + }
> +
> + return obj;
> +}
> +
> +static void put_obj(struct ntsync_obj *obj)
> +{
> + fput(obj->file);
> +}
> +
> +static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args)
> +{
> + ktime_t timeout = ns_to_ktime(args->timeout);
> + clockid_t clock = CLOCK_MONOTONIC;
> + ktime_t *timeout_ptr;
> + int ret = 0;
> +
> + timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout);
> +
> + if (args->flags & NTSYNC_WAIT_REALTIME)
> + clock = CLOCK_REALTIME;
> +
> + do {
> + if (signal_pending(current)) {
> + ret = -ERESTARTSYS;
> + break;
> + }
> +
> + set_current_state(TASK_INTERRUPTIBLE);
> + if (atomic_read(&q->signaled) != -1) {
> + ret = 0;
> + break;
What happens if the value changes right after you read it?
Rolling your own lock is tricky, and needs review from the locking
maintainers. And probably some more documentation as to what is
happening and why our normal types of locks can't be used here?
thanks,
greg k-h
On Thursday, 11 April 2024 08:34:23 CDT Greg Kroah-Hartman wrote:
> On Thu, Mar 28, 2024 at 07:05:55PM -0500, Elizabeth Figura wrote:
> > This corresponds to part of the functionality of the NT syscall
> > NtWaitForMultipleObjects(). Specifically, it implements the behaviour where
> > the third argument (wait_any) is TRUE, and it does not handle alertable waits.
> > Those features have been split out into separate patches to ease review.
> >
> > NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike
> > poll(), it "consumes" objects when they are signaled. For semaphores, this means
> > decreasing one from the internal counter. At most one object can be consumed by
> > this function.
> >
> > Up to 64 objects can be waited on at once. As soon as one is signaled, the
> > object with the lowest index is consumed, and that index is returned via the
> > "index" field.
>
> So it's kind of like our internal locks already? Or futex?
Striking the right balance of explaining the problem space without
inundating the reader with information has been tricky; I'll do my best
to try to explain here.
The primitives include mutexes and semaphores, like our internal locks.
I don't really want to compare them to futexes because futexes don't
have internal state.
However NT's primitives are *way* more complicated. The big part of it
is they consume state in a way that usual wait functions don't, and as
if that weren't enough, you can do operations with them like
wait-for-all (wait for all objects to be simultaneously signaled and
atomically consume them) or pulse (signal an object without changing
its state). None of this can be expressed with poll or futex.
You can't even express those operations with wait_event() etc. We
really need to replace the entire wait queue and use schedule() +
wake_up_process() directly. ntsync_q is the per-waiter struct (our
analogue of a wait queue entry) in this scheme.
They're also really ugly things to do; they only exist for
compatibility reasons, and retrofitting support into anything would
complicate and slow down hot paths.
> > A timeout is supported. The timeout is passed as a u64 nanosecond value, which
> > represents absolute time measured against either the MONOTONIC or REALTIME clock
> > (controlled by the flags argument). If U64_MAX is passed, the ioctl waits
> > indefinitely.
> >
> > This ioctl validates that all objects belong to the relevant device. This is not
> > necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be
> > necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch.
> >
> > Two u32s of padding are left in the ntsync_wait_args structure; one will be used
> > by a patch later in the series (which is split out to ease review).
> >
> > Signed-off-by: Elizabeth Figura <[email protected]>
> > ---
> > drivers/misc/ntsync.c | 250 ++++++++++++++++++++++++++++++++++++
> > include/uapi/linux/ntsync.h | 16 +++
> > 2 files changed, 266 insertions(+)
> >
> > diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
> > index 3c2f743c58b0..c6f84a5fc8c0 100644
> > --- a/drivers/misc/ntsync.c
> > +++ b/drivers/misc/ntsync.c
> > @@ -6,11 +6,16 @@
> > */
> >
> > #include <linux/anon_inodes.h>
> > +#include <linux/atomic.h>
> > #include <linux/file.h>
> > #include <linux/fs.h>
> > +#include <linux/hrtimer.h>
> > +#include <linux/ktime.h>
> > #include <linux/miscdevice.h>
> > #include <linux/module.h>
> > #include <linux/overflow.h>
> > +#include <linux/sched.h>
> > +#include <linux/sched/signal.h>
> > #include <linux/slab.h>
> > #include <linux/spinlock.h>
> > #include <uapi/linux/ntsync.h>
> > @@ -30,6 +35,8 @@ enum ntsync_type {
> > *
> > * Both rely on struct file for reference counting. Individual
> > * ntsync_obj objects take a reference to the device when created.
> > + * Wait operations take a reference to each object being waited on for
> > + * the duration of the wait.
> > */
> >
> > struct ntsync_obj {
> > @@ -47,12 +54,56 @@ struct ntsync_obj {
> > __u32 max;
> > } sem;
> > } u;
> > +
> > + struct list_head any_waiters;
> > +};
> > +
> > +struct ntsync_q_entry {
> > + struct list_head node;
> > + struct ntsync_q *q;
> > + struct ntsync_obj *obj;
> > + __u32 index;
> > +};
> > +
> > +struct ntsync_q {
> > + struct task_struct *task;
> > + __u32 owner;
> > +
> > + /*
> > + * Protected via atomic_try_cmpxchg(). Only the thread that wins the
> > + * compare-and-swap may actually change object states and wake this
> > + * task.
> > + */
> > + atomic_t signaled;
>
> This feels odd, why are you duplicating a normal lock functionality
> here?
ntsync_q represents a single waiter (like struct wait_queue_entry).
In short, waiting is a destructive operation; it changes the state of
the primitives waited on. If a waiter is woken successfully then it
must have consumed the state of exactly one object.
Therefore, if task A is waiting on two primitives X and Y, and those
primitives are respectively woken at the same time by tasks B and C, we
need a way to ensure that B and C don't both wake A and consume the
state of X and Y. Only one of them should win.
We could do that with a lock on the ntsync_q struct, but having a
single variable with atomic-test-and-set achieves the same thing while
being lock-free.
> > +
> > + __u32 count;
> > + struct ntsync_q_entry entries[];
> > };
> >
> > struct ntsync_device {
> > struct file *file;
> > };
> >
> > +static void try_wake_any_sem(struct ntsync_obj *sem)
> > +{
> > + struct ntsync_q_entry *entry;
> > +
> > + lockdep_assert_held(&sem->lock);
> > +
> > + list_for_each_entry(entry, &sem->any_waiters, node) {
> > + struct ntsync_q *q = entry->q;
> > + int signaled = -1;
> > +
> > + if (!sem->u.sem.count)
> > + break;
> > +
> > + if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
> > + sem->u.sem.count--;
> > + wake_up_process(q->task);
> > + }
>
> You are waking up _all_ "locks" that with the atomic_try_cmpxchg() call,
> right? Not just the "first".
>
> Or am I confused?
This is looping over all tasks trying to lock / acquire "sem", and
waking them (assuming something else didn't wake them first) while
decrementing "sem" state accordingly.
> > + }
> > +}
> > +
> > /*
> > * Actually change the semaphore state, returning -EOVERFLOW if it is made
> > * invalid.
> > @@ -88,6 +139,8 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
> >
> > prev_count = sem->u.sem.count;
> > ret = post_sem_state(sem, args);
> > + if (!ret)
> > + try_wake_any_sem(sem);
> >
> > spin_unlock(&sem->lock);
> >
> > @@ -141,6 +194,7 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
> > obj->dev = dev;
> > get_file(dev->file);
> > spin_lock_init(&obj->lock);
> > + INIT_LIST_HEAD(&obj->any_waiters);
> >
> > return obj;
> > }
> > @@ -191,6 +245,200 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
> > return put_user(fd, &user_args->sem);
> > }
> >
> > +static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
> > +{
> > + struct file *file = fget(fd);
> > + struct ntsync_obj *obj;
> > +
> > + if (!file)
> > + return NULL;
> > +
> > + if (file->f_op != &ntsync_obj_fops) {
> > + fput(file);
> > + return NULL;
> > + }
> > +
> > + obj = file->private_data;
> > + if (obj->dev != dev) {
> > + fput(file);
> > + return NULL;
> > + }
> > +
> > + return obj;
> > +}
> > +
> > +static void put_obj(struct ntsync_obj *obj)
> > +{
> > + fput(obj->file);
> > +}
> > +
> > +static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args)
> > +{
> > + ktime_t timeout = ns_to_ktime(args->timeout);
> > + clockid_t clock = CLOCK_MONOTONIC;
> > + ktime_t *timeout_ptr;
> > + int ret = 0;
> > +
> > + timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout);
> > +
> > + if (args->flags & NTSYNC_WAIT_REALTIME)
> > + clock = CLOCK_REALTIME;
> > +
> > + do {
> > + if (signal_pending(current)) {
> > + ret = -ERESTARTSYS;
> > + break;
> > + }
> > +
> > + set_current_state(TASK_INTERRUPTIBLE);
> > + if (atomic_read(&q->signaled) != -1) {
> > + ret = 0;
> > + break;
>
> What happens if the value changes right after you read it?
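The corresponding wake code flips signaled and then does
wake_up_process(). Since we set TASK_INTERRUPTIBLE before reading
q->signaled, a wakeup that lands after the read puts us back in
TASK_RUNNING, so schedule() returns immediately (and we see
q->signaled set and exit the loop).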
> Rolling your own lock is tricky, and needs review from the locking
> maintainers. And probably some more documentation as to what is
> happening and why our normal types of locks can't be used here?
Definitely. (Unfortunately this hasn't gotten attention from any locking
maintainer yet since your last call for review; not sure if there's
anything I can do there.)
Hopefully my comment at the top of this mail explains why we're rolling
our own everything, but if not please let me know and I can try to
explain more clearly.
--Zeb
On Thu, Apr 11, 2024 at 07:33:07PM -0500, Elizabeth Figura wrote:
> > Rolling your own lock is tricky, and needs review from the locking
> > maintainers. And probably some more documentation as to what is
> > happening and why our normal types of locks can't be used here?
>
> Definitely. (Unfortunately this hasn't gotten attention from any locking
> maintainer yet since your last call for review; not sure if there's
> anything I can do there.)
You only seem to have cc:ed one of the "LOCKING PRIMITIVES" maintainers
on this patchset, not all of them, which might be the reason why it has
been ignored :(
Perhaps change that for the next version of this patchset?
thanks,
greg k-h