Hi all,
this patch adds workqueue based fsync offload. Version of this
patch have been floating around for a couple years, but we now
have a user with seastar used by ScyllaDB (who sponsored this
work) that really wants this in addition to the aio poll support.
More details are in the patch itself.
Because the iocb types have been defined sine day one (and probably
were supported by RHEL3) libaio already supports these calls as-is.
This also pulls in the aio cleanups and io_pgetevents support previously
submitted and review as part of the aio poll series. The aio poll
series will be resubmitted on top of this series
A git tree is available here:
git://git.infradead.org/users/hch/vfs.git aio-fsync.4
Gitweb:
http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/aio-fsync.4
Changes since V3:
- rebased on top of 4.17-rc3
- improved/added a few comments
Changes since V2:
- don't introduce a use after free for lockdep sb release tracking
- set up list for cancellation before I/O submission
Changes since V1:
- remove a BUG_ON_ONE(is_sync_kiocb(kiocb));
- moved cancellation patches to the poll series
- improve a list_empty check
The page size is in no way related to the aio code, and printing it in
the (debug) dmesg at every boot serves no purpose.
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Jeff Moyer <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/aio.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 88d7927ffbc6..add46b06be86 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -264,9 +264,6 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-
- pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
-
return 0;
}
__initcall(aio_setup);
--
2.17.0
If we release the lockdep write protection token before calling into
->write_iter and thus never access the file pointer after an -EIOCBQUEUED
return from ->write_iter or ->read_iter we don't need this extra
reference.
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/aio.c | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 18507743757a..e5866e2fc331 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1515,16 +1515,19 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
- req->ki_flags |= IOCB_WRITE;
- file_start_write(file);
- ret = aio_ret(req, call_write_iter(file, req, &iter));
/*
- * We release freeze protection in aio_complete(). Fool lockdep
- * by telling it the lock got released so that it doesn't
- * complain about held lock when we return to userspace.
+ * Open-code file_start_write here to grab freeze protection,
+ * which will be released by another thread in aio_complete().
+ * Fool lockdep by telling it the lock got released so that it
+ * doesn't complain about the held lock when we return to
+ * userspace.
*/
- if (S_ISREG(file_inode(file)->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode)) {
+ __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ }
+ req->ki_flags |= IOCB_WRITE;
+ ret = aio_ret(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
return ret;
@@ -1599,7 +1602,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_user_iocb = user_iocb;
req->ki_user_data = iocb->aio_data;
- get_file(file);
switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
ret = aio_read(&req->common, iocb, false, compat);
@@ -1618,8 +1620,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
ret = -EINVAL;
break;
}
- fput(file);
+ /*
+ * If ret is -EIOCBQUEUED, ownership of the file reference acquired
+ * above passed to the file system, which at this point might have
+ * dropped the reference, so we must be careful to not reference it
+ * once we have called into the file system.
+ */
if (ret && ret != -EIOCBQUEUED)
goto out_put_req;
return 0;
--
2.17.0
Don't reference the kiocb structure from the common aio code, and move
any use of it into helper specific to the read/write path. This is in
preparation for aio_poll support that wants to use the space for different
fields.
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Jeff Moyer <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/aio.c | 161 +++++++++++++++++++++++++++++++------------------------
1 file changed, 92 insertions(+), 69 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index e5866e2fc331..d07c27b3fe88 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -170,7 +170,9 @@ struct kioctx {
#define KIOCB_CANCELLED ((void *) (~0ULL))
struct aio_kiocb {
- struct kiocb common;
+ union {
+ struct kiocb rw;
+ };
struct kioctx *ki_ctx;
kiocb_cancel_fn *ki_cancel;
@@ -549,7 +551,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
+ struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
@@ -581,7 +583,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
} while (cancel != old);
- return cancel(&kiocb->common);
+ return cancel(&kiocb->rw);
}
/*
@@ -1046,15 +1048,6 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
return NULL;
}
-static void kiocb_free(struct aio_kiocb *req)
-{
- if (req->common.ki_filp)
- fput(req->common.ki_filp);
- if (req->ki_eventfd != NULL)
- eventfd_ctx_put(req->ki_eventfd);
- kmem_cache_free(kiocb_cachep, req);
-}
-
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
struct aio_ring __user *ring = (void __user *)ctx_id;
@@ -1085,27 +1078,14 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-static void aio_complete(struct kiocb *kiocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
{
- struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
unsigned long flags;
- if (kiocb->ki_flags & IOCB_WRITE) {
- struct file *file = kiocb->ki_filp;
-
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
- if (S_ISREG(file_inode(file)->i_mode))
- __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
- file_end_write(file);
- }
-
if (!list_empty_careful(&iocb->ki_list)) {
unsigned long flags;
@@ -1167,11 +1147,12 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
* eventfd. The eventfd_signal() function is safe to be called
* from IRQ context.
*/
- if (iocb->ki_eventfd != NULL)
+ if (iocb->ki_eventfd) {
eventfd_signal(iocb->ki_eventfd, 1);
+ eventfd_ctx_put(iocb->ki_eventfd);
+ }
- /* everything turned out well, dispose of the aiocb. */
- kiocb_free(iocb);
+ kmem_cache_free(kiocb_cachep, iocb);
/*
* We have to order our ring_info tail store above and test
@@ -1434,6 +1415,45 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
return -EINVAL;
}
+static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
+
+ if (kiocb->ki_flags & IOCB_WRITE) {
+ struct inode *inode = file_inode(kiocb->ki_filp);
+
+ /*
+ * Tell lockdep we inherited freeze protection from submission
+ * thread.
+ */
+ if (S_ISREG(inode->i_mode))
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ file_end_write(kiocb->ki_filp);
+ }
+
+ fput(kiocb->ki_filp);
+ aio_complete(iocb, res, res2);
+}
+
+static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+{
+ int ret;
+
+ req->ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->ki_filp))
+ return -EBADF;
+ req->ki_complete = aio_complete_rw;
+ req->ki_pos = iocb->aio_offset;
+ req->ki_flags = iocb_flags(req->ki_filp);
+ if (iocb->aio_flags & IOCB_FLAG_RESFD)
+ req->ki_flags |= IOCB_EVENTFD;
+ req->ki_hint = file_write_hint(req->ki_filp);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ if (unlikely(ret))
+ fput(req->ki_filp);
+ return ret;
+}
+
static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
{
@@ -1453,7 +1473,7 @@ static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
}
-static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
+static inline ssize_t aio_rw_ret(struct kiocb *req, ssize_t ret)
{
switch (ret) {
case -EIOCBQUEUED:
@@ -1469,7 +1489,7 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
ret = -EINTR;
/*FALLTHRU*/
default:
- aio_complete(req, ret, 0);
+ aio_complete_rw(req, ret, 0);
return 0;
}
}
@@ -1477,59 +1497,79 @@ static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_READ)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret)
- ret = aio_ret(req, call_read_iter(file, req, &iter));
+ ret = aio_rw_ret(req, call_read_iter(file, req, &iter));
kfree(iovec);
+out_fput:
+ if (unlikely(ret && ret != -EIOCBQUEUED))
+ fput(file);
return ret;
}
static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
bool compat)
{
- struct file *file = req->ki_filp;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
+ struct file *file;
ssize_t ret;
+ ret = aio_prep_rw(req, iocb);
+ if (ret)
+ return ret;
+ file = req->ki_filp;
+
+ ret = -EBADF;
if (unlikely(!(file->f_mode & FMODE_WRITE)))
- return -EBADF;
+ goto out_fput;
+ ret = -EINVAL;
if (unlikely(!file->f_op->write_iter))
- return -EINVAL;
+ goto out_fput;
ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
if (ret)
- return ret;
+ goto out_fput;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
/*
* Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in aio_complete().
- * Fool lockdep by telling it the lock got released so that it
- * doesn't complain about the held lock when we return to
- * userspace.
+ * which will be released by another thread in
+ * aio_complete_rw(). Fool lockdep by telling it the lock got
+ * released so that it doesn't complain about the held lock when
+ * we return to userspace.
*/
if (S_ISREG(file_inode(file)->i_mode)) {
__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}
req->ki_flags |= IOCB_WRITE;
- ret = aio_ret(req, call_write_iter(file, req, &iter));
+ ret = aio_rw_ret(req, call_write_iter(file, req, &iter));
}
kfree(iovec);
+out_fput:
+ if (unlikely(ret && ret != -EIOCBQUEUED))
+ fput(file);
return ret;
}
@@ -1537,7 +1577,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb, bool compat)
{
struct aio_kiocb *req;
- struct file *file;
ssize_t ret;
/* enforce forwards compatibility on users */
@@ -1560,16 +1599,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!req))
return -EAGAIN;
- req->common.ki_filp = file = fget(iocb->aio_fildes);
- if (unlikely(!req->common.ki_filp)) {
- ret = -EBADF;
- goto out_put_req;
- }
- req->common.ki_pos = iocb->aio_offset;
- req->common.ki_complete = aio_complete;
- req->common.ki_flags = iocb_flags(req->common.ki_filp);
- req->common.ki_hint = file_write_hint(file);
-
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
* If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1583,14 +1612,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_eventfd = NULL;
goto out_put_req;
}
-
- req->common.ki_flags |= IOCB_EVENTFD;
- }
-
- ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
- if (unlikely(ret)) {
- pr_debug("EINVAL: aio_rw_flags\n");
- goto out_put_req;
}
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1604,16 +1625,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
switch (iocb->aio_lio_opcode) {
case IOCB_CMD_PREAD:
- ret = aio_read(&req->common, iocb, false, compat);
+ ret = aio_read(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PWRITE:
- ret = aio_write(&req->common, iocb, false, compat);
+ ret = aio_write(&req->rw, iocb, false, compat);
break;
case IOCB_CMD_PREADV:
- ret = aio_read(&req->common, iocb, true, compat);
+ ret = aio_read(&req->rw, iocb, true, compat);
break;
case IOCB_CMD_PWRITEV:
- ret = aio_write(&req->common, iocb, true, compat);
+ ret = aio_write(&req->rw, iocb, true, compat);
break;
default:
pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
@@ -1633,7 +1654,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
out_put_req:
put_reqs_available(ctx, 1);
percpu_ref_put(&ctx->reqs);
- kiocb_free(req);
+ if (req->ki_eventfd)
+ eventfd_ctx_put(req->ki_eventfd);
+ kmem_cache_free(kiocb_cachep, req);
return ret;
}
--
2.17.0
Simple workqueue offload for now, but prepared for adding a real aio_fsync
method if the need arises. Based on an earlier patch from Dave Chinner.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/aio.c | 43 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c
index d07c27b3fe88..61d2e6942951 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -156,6 +156,12 @@ struct kioctx {
unsigned id;
};
+struct fsync_iocb {
+ struct work_struct work;
+ struct file *file;
+ bool datasync;
+};
+
/*
* We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
* cancelled or completed (this makes a certain amount of sense because
@@ -172,6 +178,7 @@ struct kioctx {
struct aio_kiocb {
union {
struct kiocb rw;
+ struct fsync_iocb fsync;
};
struct kioctx *ki_ctx;
@@ -1573,6 +1580,36 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
return ret;
}
+static void aio_fsync_work(struct work_struct *work)
+{
+ struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
+ int ret;
+
+ ret = vfs_fsync(req->file, req->datasync);
+ fput(req->file);
+ aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+}
+
+static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+{
+ if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
+ iocb->aio_rw_flags))
+ return -EINVAL;
+
+ req->file = fget(iocb->aio_fildes);
+ if (unlikely(!req->file))
+ return -EBADF;
+ if (unlikely(!req->file->f_op->fsync)) {
+ fput(req->file);
+ return -EINVAL;
+ }
+
+ req->datasync = datasync;
+ INIT_WORK(&req->work, aio_fsync_work);
+ schedule_work(&req->work);
+ return -EIOCBQUEUED;
+}
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb, bool compat)
{
@@ -1636,6 +1673,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
case IOCB_CMD_PWRITEV:
ret = aio_write(&req->rw, iocb, true, compat);
break;
+ case IOCB_CMD_FSYNC:
+ ret = aio_fsync(&req->fsync, iocb, false);
+ break;
+ case IOCB_CMD_FDSYNC:
+ ret = aio_fsync(&req->fsync, iocb, true);
+ break;
default:
pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
ret = -EINVAL;
--
2.17.0
This is the io_getevents equivalent of ppoll/pselect and allows to
properly mix signals and aio completions (especially with IOCB_CMD_POLL)
and atomically executes the following sequence:
sigset_t origmask;
pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
ret = io_getevents(ctx, min_nr, nr, events, timeout);
pthread_sigmask(SIG_SETMASK, &origmask, NULL);
Note that unlike many other signal related calls we do not pass a sigmask
size, as that would get us to 7 arguments, which aren't easily supported
by the syscall infrastructure. It seems a lot less painful to just add a
new syscall variant in the unlikely case we're going to increase the
sigset size.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/aio.c | 114 ++++++++++++++++++++++---
include/linux/compat.h | 7 ++
include/linux/syscalls.h | 6 ++
include/uapi/asm-generic/unistd.h | 4 +-
include/uapi/linux/aio_abi.h | 6 ++
kernel/sys_ni.c | 2 +
8 files changed, 130 insertions(+), 11 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d6b27dab1b30..14a2f996e543 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -396,3 +396,4 @@
382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free
383 i386 statx sys_statx __ia32_sys_statx
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
+385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 4dfe42666d0c..cd36232ab62f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -341,6 +341,7 @@
330 common pkey_alloc __x64_sys_pkey_alloc
331 common pkey_free __x64_sys_pkey_free
332 common statx __x64_sys_statx
+333 common io_pgetevents __x64_sys_io_pgetevents
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index 61d2e6942951..f3eae5d5771b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1303,10 +1303,6 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
wait_event_interruptible_hrtimeout(ctx->wait,
aio_read_events(ctx, min_nr, nr, event, &ret),
until);
-
- if (!ret && signal_pending(current))
- ret = -EINTR;
-
return ret;
}
@@ -1921,13 +1917,60 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
struct timespec __user *, timeout)
{
struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
- if (timeout) {
- if (unlikely(get_timespec64(&ts, timeout)))
+SYSCALL_DEFINE6(io_pgetevents,
+ aio_context_t, ctx_id,
+ long, min_nr,
+ long, nr,
+ struct io_event __user *, events,
+ struct timespec __user *, timeout,
+ const struct __aio_sigset __user *, usig)
+{
+ struct __aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 ts;
+ int ret;
+
+ if (timeout && unlikely(get_timespec64(&ts, timeout)))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -1938,13 +1981,64 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
struct compat_timespec __user *, timeout)
{
struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (!ret && signal_pending(current))
+ ret = -EINTR;
+ return ret;
+}
+
- if (timeout) {
- if (compat_get_timespec64(&t, timeout))
+struct __compat_aio_sigset {
+ compat_sigset_t __user *sigmask;
+ compat_size_t sigsetsize;
+};
+
+COMPAT_SYSCALL_DEFINE6(io_pgetevents,
+ compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout,
+ const struct __compat_aio_sigset __user *, usig)
+{
+ struct __compat_aio_sigset ksig = { NULL, };
+ sigset_t ksigmask, sigsaved;
+ struct timespec64 t;
+ int ret;
+
+ if (timeout && compat_get_timespec64(&t, timeout))
+ return -EFAULT;
+
+ if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
+ return -EFAULT;
+
+ if (ksig.sigmask) {
+ if (ksig.sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (get_compat_sigset(&ksigmask, ksig.sigmask))
return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+ ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ if (signal_pending(current)) {
+ if (ksig.sigmask) {
+ current->saved_sigmask = sigsaved;
+ set_restore_sigmask();
+ }
+ if (!ret)
+ ret = -ERESTARTNOHAND;
+ } else {
+ if (ksig.sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
}
- return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
+ return ret;
}
#endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 081281ad5772..ad192057b887 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -330,6 +330,7 @@ extern int put_compat_rusage(const struct rusage *,
struct compat_rusage __user *);
struct compat_siginfo;
+struct __compat_aio_sigset;
struct compat_dirent {
u32 d_ino;
@@ -553,6 +554,12 @@ asmlinkage long compat_sys_io_getevents(compat_aio_context_t ctx_id,
compat_long_t nr,
struct io_event __user *events,
struct compat_timespec __user *timeout);
+asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
+ compat_long_t min_nr,
+ compat_long_t nr,
+ struct io_event __user *events,
+ struct compat_timespec __user *timeout,
+ const struct __compat_aio_sigset __user *usig);
/* fs/cookies.c */
asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 70fcda1a9049..811172fcb916 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -290,6 +290,12 @@ asmlinkage long sys_io_getevents(aio_context_t ctx_id,
long nr,
struct io_event __user *events,
struct timespec __user *timeout);
+asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
+ long min_nr,
+ long nr,
+ struct io_event __user *events,
+ struct timespec __user *timeout,
+ const struct __aio_sigset *sig);
/* fs/xattr.c */
asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 8bcb186c6f67..42990676a55e 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,9 +732,11 @@ __SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_io_pgetevents 292
+__SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc70ddf..2c0a3415beee 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -29,6 +29,7 @@
#include <linux/types.h>
#include <linux/fs.h>
+#include <linux/signal.h>
#include <asm/byteorder.h>
typedef __kernel_ulong_t aio_context_t;
@@ -108,5 +109,10 @@ struct iocb {
#undef IFBIG
#undef IFLITTLE
+struct __aio_sigset {
+ sigset_t __user *sigmask;
+ size_t sigsetsize;
+};
+
#endif /* __LINUX__AIO_ABI_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 9791364925dc..183169c2a75b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -43,7 +43,9 @@ COND_SYSCALL(io_submit);
COND_SYSCALL_COMPAT(io_submit);
COND_SYSCALL(io_cancel);
COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents);
COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents);
/* fs/xattr.c */
--
2.17.0
These days we don't treat sync iocbs special in the aio completion code as
they never use it. Remove the old comment and BUG_ON given that the
current definition of is_sync_kiocb makes it impossible to hit.
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/aio.c | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index add46b06be86..7c1855afd723 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1107,15 +1107,6 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
file_end_write(file);
}
- /*
- * Special case handling for sync iocbs:
- * - events go directly into the iocb for fast handling
- * - the sync task with the iocb in its stack holds the single iocb
- * ref, no other paths have a way to get another ref
- * - the sync task helpfully left a reference to itself in the iocb
- */
- BUG_ON(is_sync_kiocb(kiocb));
-
if (iocb->ki_list.next) {
unsigned long flags;
--
2.17.0
Instead of handcoded non-null checks always initialize ki_list to an
empty list and use list_empty / list_empty_careful on it. While we're
at it also error out on a double call to kiocb_set_cancel_fn instead
of ignoring it.
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Jeff Moyer <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
---
fs/aio.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 7c1855afd723..18507743757a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -553,13 +553,12 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
- spin_lock_irqsave(&ctx->ctx_lock, flags);
-
- if (!req->ki_list.next)
- list_add(&req->ki_list, &ctx->active_reqs);
+ if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
+ return;
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
-
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
@@ -1039,7 +1038,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
goto out_put;
percpu_ref_get(&ctx->reqs);
-
+ INIT_LIST_HEAD(&req->ki_list);
req->ki_ctx = ctx;
return req;
out_put:
@@ -1107,7 +1106,7 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
file_end_write(file);
}
- if (iocb->ki_list.next) {
+ if (!list_empty_careful(&iocb->ki_list)) {
unsigned long flags;
spin_lock_irqsave(&ctx->ctx_lock, flags);
--
2.17.0
On Wed, May 02, 2018 at 11:14:45PM +0200, Christoph Hellwig wrote:
> If we release the lockdep write protection token before calling into
> ->write_iter and thus never access the file pointer after an -EIOCBQUEUED
> return from ->write_iter or ->read_iter we don't need this extra
> reference.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
LGTM,
Reviewed-by: Darrick J. Wong <[email protected]>
--D
> ---
> fs/aio.c | 25 ++++++++++++++++---------
> 1 file changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 18507743757a..e5866e2fc331 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1515,16 +1515,19 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
> return ret;
> ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
> if (!ret) {
> - req->ki_flags |= IOCB_WRITE;
> - file_start_write(file);
> - ret = aio_ret(req, call_write_iter(file, req, &iter));
> /*
> - * We release freeze protection in aio_complete(). Fool lockdep
> - * by telling it the lock got released so that it doesn't
> - * complain about held lock when we return to userspace.
> + * Open-code file_start_write here to grab freeze protection,
> + * which will be released by another thread in aio_complete().
> + * Fool lockdep by telling it the lock got released so that it
> + * doesn't complain about the held lock when we return to
> + * userspace.
> */
> - if (S_ISREG(file_inode(file)->i_mode))
> + if (S_ISREG(file_inode(file)->i_mode)) {
> + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> + }
> + req->ki_flags |= IOCB_WRITE;
> + ret = aio_ret(req, call_write_iter(file, req, &iter));
> }
> kfree(iovec);
> return ret;
> @@ -1599,7 +1602,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
> req->ki_user_iocb = user_iocb;
> req->ki_user_data = iocb->aio_data;
>
> - get_file(file);
> switch (iocb->aio_lio_opcode) {
> case IOCB_CMD_PREAD:
> ret = aio_read(&req->common, iocb, false, compat);
> @@ -1618,8 +1620,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
> ret = -EINVAL;
> break;
> }
> - fput(file);
>
> + /*
> + * If ret is -EIOCBQUEUED, ownership of the file reference acquired
> + * above passed to the file system, which at this point might have
> + * dropped the reference, so we must be careful to not reference it
> + * once we have called into the file system.
> + */
> if (ret && ret != -EIOCBQUEUED)
> goto out_put_req;
> return 0;
> --
> 2.17.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, May 02, 2018 at 11:14:41PM +0200, Christoph Hellwig wrote:
> Hi all,
>
> this patch adds workqueue based fsync offload. Version of this
> patch have been floating around for a couple years, but we now
> have a user with seastar used by ScyllaDB (who sponsored this
> work) that really wants this in addition to the aio poll support.
> More details are in the patch itself.
>
> Because the iocb types have been defined sine day one (and probably
> were supported by RHEL3) libaio already supports these calls as-is.
>
> This also pulls in the aio cleanups and io_pgetevents support previously
> submitted and review as part of the aio poll series. The aio poll
> series will be resubmitted on top of this series
>
> A git tree is available here:
>
> git://git.infradead.org/users/hch/vfs.git aio-fsync.4
I can live with that. Merged.
Given this:
On Wed, May 02, 2018 at 11:14:48PM +0200, Christoph Hellwig wrote:
> +struct __aio_sigset {
> + sigset_t __user *sigmask;
> + size_t sigsetsize;
> +};
and:
> +asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
> + long min_nr,
> + long nr,
> + struct io_event __user *events,
> + struct timespec __user *timeout,
> + const struct __aio_sigset *sig);
The following paragraph in the commit message would appear to be
misleading since __aio_sigset contains a size:
> Note that unlike many other signal related calls we do not pass a sigmask
> size, as that would get us to 7 arguments, which aren't easily supported
> by the syscall infrastructure. It seems a lot less painful to just add a
> new syscall variant in the unlikely case we're going to increase the
> sigset size.
Is it possible to correct it before this gets merged?
Thanks
James
On Fri, May 18, 2018 at 09:28:38AM +0100, James Hogan wrote:
> Given this:
>
> On Wed, May 02, 2018 at 11:14:48PM +0200, Christoph Hellwig wrote:
> > +struct __aio_sigset {
> > + sigset_t __user *sigmask;
> > + size_t sigsetsize;
> > +};
>
> and:
>
> > +asmlinkage long sys_io_pgetevents(aio_context_t ctx_id,
> > + long min_nr,
> > + long nr,
> > + struct io_event __user *events,
> > + struct timespec __user *timeout,
> > + const struct __aio_sigset *sig);
>
> The following paragraph in the commit message would appear to be
> misleading since __aio_sigset contains a size:
>
> > Note that unlike many other signal related calls we do not pass a sigmask
> > size, as that would get us to 7 arguments, which aren't easily supported
> > by the syscall infrastructure. It seems a lot less painful to just add a
> > new syscall variant in the unlikely case we're going to increase the
> > sigset size.
>
> Is it possible to correct it before this gets merged?
True, the calling convention change based on feedback. Al, is the
branch stable or you can edit this out?
On Wed, May 02, 2018 at 11:14:48PM +0200, Christoph Hellwig wrote:
> This is the io_getevents equivalent of ppoll/pselect and allows to
> properly mix signals and aio completions (especially with IOCB_CMD_POLL)
> and atomically executes the following sequence:
>
> sigset_t origmask;
>
> pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
> ret = io_getevents(ctx, min_nr, nr, events, timeout);
> pthread_sigmask(SIG_SETMASK, &origmask, NULL);
>
> Note that unlike many other signal related calls we do not pass a sigmask
> size, as that would get us to 7 arguments, which aren't easily supported
> by the syscall infrastructure. It seems a lot less painful to just add a
> new syscall variant in the unlikely case we're going to increase the
> sigset size.
Starting with this commit following code does not compile for me
anymore:
#include <signal.h>
#include <linux/aio_abi.h>
int main()
{
return 0;
}
In file included from /usr/include/linux/signal.h:5,
from /usr/include/linux/aio_abi.h:32,
from include.c:2:
/usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
typedef unsigned long sigset_t;
^~~~~~~~
In file included from /usr/include/signal.h:35,
from include.c:1:
/usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of ‘sigset_t’ was here
typedef __sigset_t sigset_t;
^~~~~~~~
In file included from /usr/include/linux/signal.h:5,
from /usr/include/linux/aio_abi.h:32,
from include.c:2:
/usr/include/asm/signal.h:115:8: error: redefinition of ‘struct sigaction’
struct sigaction {
^~~~~~~~~
In file included from /usr/include/signal.h:226,
from include.c:1:
/usr/include/bits/sigaction.h:27:8: note: originally defined here
struct sigaction
^~~~~~~~~
[and much more]
Before this commit it compiles without errors.
Adrian
On Wed, Jul 04, 2018 at 04:21:16PM +0200, Adrian Reber wrote:
> In file included from /usr/include/linux/signal.h:5,
> from /usr/include/linux/aio_abi.h:32,
> from include.c:2:
> /usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
> typedef unsigned long sigset_t;
> ^~~~~~~~
> In file included from /usr/include/signal.h:35,
> from include.c:1:
> /usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of ‘sigset_t’ was here
> typedef __sigset_t sigset_t;
I guess we could do something like the patch below, although it is
rather ugly:
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 75846164290e..b7705ad66d78 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -29,7 +29,11 @@
#include <linux/types.h>
#include <linux/fs.h>
+#ifdef __KERNEL__
#include <linux/signal.h>
+#else
+#include <signal.h>
+#endif
#include <asm/byteorder.h>
typedef __kernel_ulong_t aio_context_t;
Am Sonntag, 8. Juli 2018, 22:44:00 CEST schrieb Christoph Hellwig:
Hi Christoph,
>
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>
> #include <linux/types.h>
> #include <linux/fs.h>
> +#ifdef __KERNEL__
> #include <linux/signal.h>
> +#else
> +#include <signal.h>
> +#endif
> #include <asm/byteorder.h>
Without such a patch, libkcapi fails to compile as well. See [1].
Apart from your suggested patch above, do you have another suggestion how make
the user space code compile?
[1] https://github.com/smuellerDD/libkcapi/issues/59
Thanks
Stephan
Am Sonntag, 8. Juli 2018, 22:44:00 CEST schrieb Christoph Hellwig:
Hi Christoph,
>
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>
> #include <linux/types.h>
> #include <linux/fs.h>
> +#ifdef __KERNEL__
> #include <linux/signal.h>
> +#else
> +#include <signal.h>
> +#endif
> #include <asm/byteorder.h>
Without such a patch, libkcapi fails to compile as well. See [1].
Apart from your suggested patch above, do you have another suggestion how make
the user space code compile?
[1] https://github.com/smuellerDD/libkcapi/issues/59
Thanks
Stephan
On Sun, Jul 08, 2018 at 10:44:00PM +0200, Christoph Hellwig wrote:
> On Wed, Jul 04, 2018 at 04:21:16PM +0200, Adrian Reber wrote:
> > In file included from /usr/include/linux/signal.h:5,
> > from /usr/include/linux/aio_abi.h:32,
> > from include.c:2:
> > /usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
> > typedef unsigned long sigset_t;
> > ^~~~~~~~
> > In file included from /usr/include/signal.h:35,
> > from include.c:1:
> > /usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of ‘sigset_t’ was here
> > typedef __sigset_t sigset_t;
>
> I guess we could do something like the patch below, although it is
> rather ugly:
>
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>
> #include <linux/types.h>
> #include <linux/fs.h>
> +#ifdef __KERNEL__
> #include <linux/signal.h>
> +#else
> +#include <signal.h>
> +#endif
I think we can not do this because this header specifies the kernel
API, but signal.h is provided by libc and sigset_t can be defined
differently there:
[avagin@laptop ~]$ cat test.c
#ifdef TEST_LINUX_SIGNAL
# include <glob.h>
# include <linux/signal.h>
#else
# include <signal.h>
#endif
#include <stdio.h>
int main()
{
printf("sizeof(sigset_t) = %d\n", sizeof(sigset_t));
return 0;
}
[avagin@laptop ~]$ gcc -DTEST_LINUX_SIGNAL test.c && ./a.out
sizeof(sigset_t) = 8
[avagin@laptop ~]$ gcc test.c && ./a.out
sizeof(sigset_t) = 128
[avagin@laptop include]$ rpm -qf /usr/include/signal.h
glibc-headers-2.27-8.fc28.i686
glibc-headers-2.27-8.fc28.x86_64
[avagin@laptop include]$ rpm -qf /usr/include/linux/signal.h
kernel-headers-4.16.5-300.fc28.x86_64
> #include <asm/byteorder.h>
>
> typedef __kernel_ulong_t aio_context_t;
On Mon, Jul 09, 2018 at 09:21:53PM +0200, Stephan M?ller wrote:
> Without such a patch, libkcapi fails to compile as well. See [1].
>
> Apart from your suggested patch above, do you have another suggestion how make
> the user space code compile?
The only other option would be to move the __aio_sigset delcaration
out of the uapi header, similar to what we do for pselect. This is
also rather ugly, but I gess that's what we'll have to do. I will
prepare a patch for it.