From: Adam Manzanares <[email protected]>
This is the per-I/O equivalent of the ioprio_set system call.
See the following link for performance implications on a SATA HDD:
https://lkml.org/lkml/2016/12/6/495
First patch factors ioprio_check_cap function out of ioprio_set system call to
also be used by the aio ioprio interface.
Second patch converts kiocb ki_hint field to a u16 to avoid kiocb bloat.
Third patch passes ioprio hint from aio iocb to kiocb and enables block_dev
usage of the per I/O ioprio feature.
v2: merge patches
use IOCB_FLAG_IOPRIO
validate intended use with IOCB_IOPRIO
add linux-api and linux-block to cc
v3: add ioprio_check_cap function
convert kiocb ki_hint to u16
use ioprio_check_cap when adding ioprio to kiocb in aio.c
Adam Manzanares (3):
block: add ioprio_check_cap function
fs: Convert kiocb rw_hint from enum to u16
fs: Add aio iopriority support for block_dev
block/ioprio.c | 22 ++++++++++++++++------
fs/aio.c | 16 ++++++++++++++++
fs/block_dev.c | 2 ++
include/linux/fs.h | 17 +++++++++++++++--
include/linux/ioprio.h | 2 ++
include/uapi/linux/aio_abi.h | 1 +
6 files changed, 52 insertions(+), 8 deletions(-)
--
2.15.1
From: Adam Manzanares <[email protected]>
In order to avoid kiocb bloat for per command iopriority support, rw_hint
is converted from enum to a u16. Added a guard around ki_hint assigment.
Signed-off-by: Adam Manzanares <[email protected]>
---
include/linux/fs.h | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..7a90ce387e00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -284,6 +284,8 @@ enum rw_hint {
WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
};
+#define MAX_KI_HINT ((1 << 16) - 1) /* ki_hint type is u16 */
+
#define IOCB_EVENTFD (1 << 0)
#define IOCB_APPEND (1 << 1)
#define IOCB_DIRECT (1 << 2)
@@ -299,7 +301,7 @@ struct kiocb {
void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
void *private;
int ki_flags;
- enum rw_hint ki_hint;
+ u16 ki_hint;
} __randomize_layout;
static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -1927,12 +1929,21 @@ static inline enum rw_hint file_write_hint(struct file *file)
static inline int iocb_flags(struct file *file);
+/* ki_hint changed from enum to u16, make sure rw_hint fits into u16 */
+static inline u16 ki_hint_valid(enum rw_hint hint)
+{
+ if (hint > MAX_KI_HINT)
+ return 0;
+
+ return hint;
+}
+
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
.ki_filp = filp,
.ki_flags = iocb_flags(filp),
- .ki_hint = file_write_hint(filp),
+ .ki_hint = ki_hint_valid(file_write_hint(filp)),
};
}
--
2.15.1
From: Adam Manzanares <[email protected]>
Aio per command iopriority support introduces a second interface between
userland and the kernel capable of passing iopriority. The aio interface also
needs the ability to verify that the submitting context has sufficient
priviledges to submit IOPRIO_RT commands. This patch creates the
ioprio_check_cap function to be used by the ioprio_set system call and also by
the aio interface.
Signed-off-by: Adam Manzanares <[email protected]>
---
block/ioprio.c | 22 ++++++++++++++++------
include/linux/ioprio.h | 2 ++
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/block/ioprio.c b/block/ioprio.c
index 6f5d0b6625e3..f9821080c92c 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -61,15 +61,10 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
-SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int data = IOPRIO_PRIO_DATA(ioprio);
- struct task_struct *p, *g;
- struct user_struct *user;
- struct pid *pgrp;
- kuid_t uid;
- int ret;
switch (class) {
case IOPRIO_CLASS_RT:
@@ -92,6 +87,21 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
return -EINVAL;
}
+ return 0;
+}
+
+SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
+{
+ struct task_struct *p, *g;
+ struct user_struct *user;
+ struct pid *pgrp;
+ kuid_t uid;
+ int ret;
+
+ ret = ioprio_check_cap(ioprio);
+ if (ret)
+ return ret;
+
ret = -ESRCH;
rcu_read_lock();
switch (which) {
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 627efac73e6d..4a28cec49ec3 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -77,4 +77,6 @@ extern int ioprio_best(unsigned short aprio, unsigned short bprio);
extern int set_task_ioprio(struct task_struct *task, int ioprio);
+extern int ioprio_check_cap(int ioprio);
+
#endif
--
2.15.1
From: Adam Manzanares <[email protected]>
When IOCB_FLAG_IOPRIO is set on the iocb aio_flags field, then we set the
newly added kiocb ki_ioprio field to the value in the iocb aio_reqprio field.
When a bio is created for an aio request by the block dev we set the priority
value of the bio to the user supplied value.
Signed-off-by: Adam Manzanares <[email protected]>
---
fs/aio.c | 16 ++++++++++++++++
fs/block_dev.c | 2 ++
include/linux/fs.h | 2 ++
include/uapi/linux/aio_abi.h | 1 +
4 files changed, 21 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c
index 88d7927ffbc6..f43d1d3a2e39 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1597,6 +1597,22 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->common.ki_flags |= IOCB_EVENTFD;
}
+ if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
+ /*
+ * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
+ * aio_reqprio is interpreted as an I/O scheduling
+ * class and priority.
+ */
+ ret = ioprio_check_cap(iocb->aio_reqprio);
+ if (ret) {
+ pr_debug("aio ioprio check cap error\n");
+ goto out_put_req;
+ }
+
+ req->common.ki_ioprio = iocb->aio_reqprio;
+ req->common.ki_flags |= IOCB_IOPRIO;
+ }
+
ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
if (unlikely(ret)) {
pr_debug("EINVAL: aio_rw_flags\n");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7ec920e27065..970bef79caa6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -355,6 +355,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
+ if (iocb->ki_flags & IOCB_IOPRIO)
+ bio->bi_ioprio = iocb->ki_ioprio;
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a90ce387e00..3415e83f6350 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -294,6 +294,7 @@ enum rw_hint {
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
#define IOCB_NOWAIT (1 << 7)
+#define IOCB_IOPRIO (1 << 8)
struct kiocb {
struct file *ki_filp;
@@ -302,6 +303,7 @@ struct kiocb {
void *private;
int ki_flags;
u16 ki_hint;
+ u16 ki_ioprio; /* See linux/ioprio.h */
} __randomize_layout;
static inline bool is_sync_kiocb(struct kiocb *kiocb)
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index a04adbc70ddf..d4593a6062ef 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -54,6 +54,7 @@ enum {
* is valid.
*/
#define IOCB_FLAG_RESFD (1 << 0)
+#define IOCB_FLAG_IOPRIO (1 << 1)
/* read() from /dev/aio returns these structures. */
struct io_event {
--
2.15.1
Hi Adam,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v4.17-rc4 next-20180508]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/adam-manzanares-wdc-com/block-add-ioprio_check_cap-function/20180509-094058
config: x86_64-randconfig-s0-05091255 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All errors (new ones prefixed by >>):
fs/aio.c: In function 'io_submit_one':
>> fs/aio.c:1606:9: error: implicit declaration of function 'ioprio_check_cap' [-Werror=implicit-function-declaration]
ret = ioprio_check_cap(iocb->aio_reqprio);
^~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/ioprio_check_cap +1606 fs/aio.c
1545
1546 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1547 struct iocb *iocb, bool compat)
1548 {
1549 struct aio_kiocb *req;
1550 struct file *file;
1551 ssize_t ret;
1552
1553 /* enforce forwards compatibility on users */
1554 if (unlikely(iocb->aio_reserved2)) {
1555 pr_debug("EINVAL: reserve field set\n");
1556 return -EINVAL;
1557 }
1558
1559 /* prevent overflows */
1560 if (unlikely(
1561 (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
1562 (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
1563 ((ssize_t)iocb->aio_nbytes < 0)
1564 )) {
1565 pr_debug("EINVAL: overflow check\n");
1566 return -EINVAL;
1567 }
1568
1569 req = aio_get_req(ctx);
1570 if (unlikely(!req))
1571 return -EAGAIN;
1572
1573 req->common.ki_filp = file = fget(iocb->aio_fildes);
1574 if (unlikely(!req->common.ki_filp)) {
1575 ret = -EBADF;
1576 goto out_put_req;
1577 }
1578 req->common.ki_pos = iocb->aio_offset;
1579 req->common.ki_complete = aio_complete;
1580 req->common.ki_flags = iocb_flags(req->common.ki_filp);
1581 req->common.ki_hint = file_write_hint(file);
1582
1583 if (iocb->aio_flags & IOCB_FLAG_RESFD) {
1584 /*
1585 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
1586 * instance of the file* now. The file descriptor must be
1587 * an eventfd() fd, and will be signaled for each completed
1588 * event using the eventfd_signal() function.
1589 */
1590 req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
1591 if (IS_ERR(req->ki_eventfd)) {
1592 ret = PTR_ERR(req->ki_eventfd);
1593 req->ki_eventfd = NULL;
1594 goto out_put_req;
1595 }
1596
1597 req->common.ki_flags |= IOCB_EVENTFD;
1598 }
1599
1600 if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
1601 /*
1602 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
1603 * aio_reqprio is interpreted as an I/O scheduling
1604 * class and priority.
1605 */
> 1606 ret = ioprio_check_cap(iocb->aio_reqprio);
1607 if (ret) {
1608 pr_debug("aio ioprio check cap error\n");
1609 goto out_put_req;
1610 }
1611
1612 req->common.ki_ioprio = iocb->aio_reqprio;
1613 req->common.ki_flags |= IOCB_IOPRIO;
1614 }
1615
1616 ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
1617 if (unlikely(ret)) {
1618 pr_debug("EINVAL: aio_rw_flags\n");
1619 goto out_put_req;
1620 }
1621
1622 ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
1623 if (unlikely(ret)) {
1624 pr_debug("EFAULT: aio_key\n");
1625 goto out_put_req;
1626 }
1627
1628 req->ki_user_iocb = user_iocb;
1629 req->ki_user_data = iocb->aio_data;
1630
1631 get_file(file);
1632 switch (iocb->aio_lio_opcode) {
1633 case IOCB_CMD_PREAD:
1634 ret = aio_read(&req->common, iocb, false, compat);
1635 break;
1636 case IOCB_CMD_PWRITE:
1637 ret = aio_write(&req->common, iocb, false, compat);
1638 break;
1639 case IOCB_CMD_PREADV:
1640 ret = aio_read(&req->common, iocb, true, compat);
1641 break;
1642 case IOCB_CMD_PWRITEV:
1643 ret = aio_write(&req->common, iocb, true, compat);
1644 break;
1645 default:
1646 pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
1647 ret = -EINVAL;
1648 break;
1649 }
1650 fput(file);
1651
1652 if (ret && ret != -EIOCBQUEUED)
1653 goto out_put_req;
1654 return 0;
1655 out_put_req:
1656 put_reqs_available(ctx, 1);
1657 percpu_ref_put(&ctx->reqs);
1658 kiocb_free(req);
1659 return ret;
1660 }
1661
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
On Tue, May 08, 2018 at 10:42:01AM -0700, [email protected] wrote:
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 760d8da1b6c7..7a90ce387e00 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -284,6 +284,8 @@ enum rw_hint {
> WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
> };
>
> +#define MAX_KI_HINT ((1 << 16) - 1) /* ki_hint type is u16 */
> +
Do we really think there will be *ever* be a need for more than 16 I/O
priority levels? I would much rather use the low four bits of KI_HINT
for the priority level, and reserve the rest of the 16 bits in KI_HINT
for some future use. (For example, we might want to use some number
of bits for a stream ID.)
- Ted
On 5/9/18 7:34 AM, Theodore Y. Ts'o wrote:
> On Tue, May 08, 2018 at 10:42:01AM -0700, [email protected] wrote:
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 760d8da1b6c7..7a90ce387e00 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -284,6 +284,8 @@ enum rw_hint {
>> WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
>> };
>>
>> +#define MAX_KI_HINT ((1 << 16) - 1) /* ki_hint type is u16 */
>> +
>
> Do we really think there will be *ever* be a need for more than 16 I/O
> priority levels? I would much rather use the low four bits of KI_HINT
> for the priority level, and reserve the rest of the 16 bits in KI_HINT
> for some future use. (For example, we might want to use some number
> of bits for a stream ID.)
Streams is essentially the only thing ki_hint is currently used for,
with the write life time hints mapping to a stream. The idea for the
user side API was to have other things than just write life time hints.
Since Adam wants to do priorities, he'd either need to pack into the
existing ki_hint, or do this patch does, which is make it smaller and
add a new member. I think the latter is cleaner.
--
Jens Axboe
On Wed, May 09, 2018 at 08:23:00AM -0600, Jens Axboe wrote:
> Streams is essentially the only thing ki_hint is currently used for,
> with the write life time hints mapping to a stream. The idea for the
> user side API was to have other things than just write life time hints.
>
> Since Adam wants to do priorities, he'd either need to pack into the
> existing ki_hint, or do this patch does, which is make it smaller and
> add a new member. I think the latter is cleaner.
Fair enough; but maybe we can use a u8 instead of a u16? 65,535
priorities still seem like way more than would ever make sense. I
think 256 priorities is still way to many, but it's simpler while
still reserving number of bits for future se.
- Ted
On 5/9/18 11:21 AM, Theodore Y. Ts'o wrote:
> On Wed, May 09, 2018 at 08:23:00AM -0600, Jens Axboe wrote:
>> Streams is essentially the only thing ki_hint is currently used for,
>> with the write life time hints mapping to a stream. The idea for the
>> user side API was to have other things than just write life time hints.
>>
>> Since Adam wants to do priorities, he'd either need to pack into the
>> existing ki_hint, or do this patch does, which is make it smaller and
>> add a new member. I think the latter is cleaner.
>
> Fair enough; but maybe we can use a u8 instead of a u16? 65,535
> priorities still seem like way more than would ever make sense. I
> think 256 priorities is still way to many, but it's simpler while
> still reserving number of bits for future se.
The intention was to mimic the ioprio_set system call, which uses 3 bits
for a prio class and 13 bits for a prio_value.
IDK what is the right amount of bits to use, but the existing use of
bits seemed flexible enough to support many types of applications and
devices.
>
> - Ted
>