2019-03-08 21:30:28

by Andrea Righi

[permalink] [raw]
Subject: [PATCH v3] blkcg: prevent priority inversion problem during sync()

When sync(2) is executed from a high-priority cgroup, the process is
forced to wait the completion of the entire outstanding writeback I/O,
even the I/O that was originally generated by low-priority cgroups
potentially.

This may cause massive latencies to random processes (even those running
in the root cgroup) that shouldn't be I/O-throttled at all, similarly to
a classic priority inversion problem.

Prevent this problem by saving a list of blkcg's that are waiting for
writeback: every time a sync(2) is executed the current blkcg is added
to the list.

Then, when I/O is throttled, if there's a blkcg waiting for writeback
different than the current blkcg, no throttling is applied (we can
probably refine this logic later, i.e., a better policy could be to
adjust the I/O rate using the blkcg with the highest speed from the list
of waiters).

See also:
https://lkml.org/lkml/2019/3/7/640

Signed-off-by: Andrea Righi <[email protected]>
---
Changes in v3:
- drop sync(2) isolation patches (this will be addressed by another
patch, potentially operating at the fs namespace level)
- use a per-bdi lock and a per-bdi list instead of a global lock and a
global list to save the list of sync(2) waiters

block/blk-cgroup.c | 130 +++++++++++++++++++++++++++++++
block/blk-throttle.c | 11 ++-
fs/fs-writeback.c | 5 ++
fs/sync.c | 8 +-
include/linux/backing-dev-defs.h | 2 +
include/linux/blk-cgroup.h | 25 ++++++
mm/backing-dev.c | 2 +
7 files changed, 179 insertions(+), 4 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2bed5725aa03..b380d678cfc2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1351,6 +1351,136 @@ struct cgroup_subsys io_cgrp_subsys = {
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

+#ifdef CONFIG_CGROUP_WRITEBACK
+struct blkcg_wb_sleeper {
+ struct blkcg *blkcg;
+ refcount_t refcnt;
+ struct list_head node;
+};
+
+static struct blkcg_wb_sleeper *
+blkcg_wb_sleeper_find(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws;
+
+ list_for_each_entry(bws, &bdi->cgwb_waiters, node)
+ if (bws->blkcg == blkcg)
+ return bws;
+ return NULL;
+}
+
+static void
+blkcg_wb_sleeper_add(struct backing_dev_info *bdi, struct blkcg_wb_sleeper *bws)
+{
+ list_add(&bws->node, &bdi->cgwb_waiters);
+}
+
+static void
+blkcg_wb_sleeper_del(struct backing_dev_info *bdi, struct blkcg_wb_sleeper *bws)
+{
+ list_del_init(&bws->node);
+}
+
+/**
+ * blkcg_wb_waiters_on_bdi - check for writeback waiters on a block device
+ * @blkcg: current blkcg cgroup
+ * @bdi: block device to check
+ *
+ * Return true if any other blkcg different than the current one is waiting for
+ * writeback on the target block device, false otherwise.
+ */
+bool blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws;
+ bool ret = false;
+
+ if (likely(list_empty(&bdi->cgwb_waiters)))
+ return false;
+ spin_lock(&bdi->cgwb_waiters_lock);
+ list_for_each_entry(bws, &bdi->cgwb_waiters, node)
+ if (bws->blkcg != blkcg) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&bdi->cgwb_waiters_lock);
+
+ return ret;
+}
+
+/**
+ * blkcg_start_wb_wait_on_bdi - add current blkcg to writeback waiters list
+ * @bdi: target block device
+ *
+ * Add current blkcg to the list of writeback waiters on target block device.
+ */
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *new_bws, *bws;
+ struct blkcg *blkcg;
+
+ new_bws = kzalloc(sizeof(*new_bws), GFP_KERNEL);
+ if (unlikely(!new_bws))
+ return;
+
+ rcu_read_lock();
+ blkcg = blkcg_from_current();
+ if (likely(blkcg)) {
+ /* Check if blkcg is already sleeping on bdi */
+ spin_lock_bh(&bdi->cgwb_waiters_lock);
+ bws = blkcg_wb_sleeper_find(blkcg, bdi);
+ if (bws) {
+ refcount_inc(&bws->refcnt);
+ } else {
+ /* Add current blkcg as a new wb sleeper on bdi */
+ css_get(&blkcg->css);
+ new_bws->blkcg = blkcg;
+ refcount_set(&new_bws->refcnt, 1);
+ blkcg_wb_sleeper_add(bdi, new_bws);
+ new_bws = NULL;
+ }
+ spin_unlock_bh(&bdi->cgwb_waiters_lock);
+ }
+ rcu_read_unlock();
+
+ kfree(new_bws);
+}
+
+/**
+ * blkcg_stop_wb_wait_on_bdi - remove current blkcg from writeback waiters list
+ * @bdi: target block device
+ *
+ * Remove current blkcg from the list of writeback waiters on target block
+ * device.
+ */
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws = NULL;
+ struct blkcg *blkcg;
+
+ rcu_read_lock();
+ blkcg = blkcg_from_current();
+ if (!blkcg) {
+ rcu_read_unlock();
+ return;
+ }
+ spin_lock_bh(&bdi->cgwb_waiters_lock);
+ bws = blkcg_wb_sleeper_find(blkcg, bdi);
+ if (unlikely(!bws)) {
+ /* blkcg_start/stop_wb_wait_on_bdi() mismatch */
+ WARN_ON(1);
+ goto out_unlock;
+ }
+ if (refcount_dec_and_test(&bws->refcnt)) {
+ blkcg_wb_sleeper_del(bdi, bws);
+ css_put(&blkcg->css);
+ kfree(bws);
+ }
+out_unlock:
+ spin_unlock_bh(&bdi->cgwb_waiters_lock);
+ rcu_read_unlock();
+}
+#endif
+
/**
* blkcg_activate_policy - activate a blkcg policy on a request_queue
* @q: request_queue of interest
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1b97a73d2fb1..da817896cded 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -970,9 +970,13 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+ struct throtl_data *td = tg->td;
+ struct request_queue *q = td->queue;
+ struct backing_dev_info *bdi = q->backing_dev_info;
+ struct blkcg_gq *blkg = tg_to_blkg(tg);

/*
- * Currently whole state machine of group depends on first bio
+ * Currently whole state machine of group depends on first bio
* queued in the group bio list. So one should not be calling
* this function with a different bio if there are other bios
* queued.
@@ -981,8 +985,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

/* If tg->bps = -1, then BW is unlimited */
- if (tg_bps_limit(tg, rw) == U64_MAX &&
- tg_iops_limit(tg, rw) == UINT_MAX) {
+ if (blkcg_wb_waiters_on_bdi(blkg->blkcg, bdi) ||
+ (tg_bps_limit(tg, rw) == U64_MAX &&
+ tg_iops_limit(tg, rw) == UINT_MAX)) {
if (wait)
*wait = 0;
return true;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 36855c1f8daf..77c039a0ec25 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -28,6 +28,7 @@
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"

/*
@@ -2446,6 +2447,8 @@ void sync_inodes_sb(struct super_block *sb)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));

+ blkcg_start_wb_wait_on_bdi(bdi);
+
/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
@@ -2453,6 +2456,8 @@ void sync_inodes_sb(struct super_block *sb)
bdi_up_write_wb_switch_rwsem(bdi);

wait_sb_inodes(sb);
+
+ blkcg_stop_wb_wait_on_bdi(bdi);
}
EXPORT_SYMBOL(sync_inodes_sb);

diff --git a/fs/sync.c b/fs/sync.c
index b54e0541ad89..3958b8f98b85 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"

#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -76,8 +77,13 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg)

static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
- if (!sb_rdonly(sb) && sb->s_op->sync_fs)
+ struct backing_dev_info *bdi = sb->s_bdi;
+
+ if (!sb_rdonly(sb) && sb->s_op->sync_fs) {
+ blkcg_start_wb_wait_on_bdi(bdi);
sb->s_op->sync_fs(sb, *(int *)arg);
+ blkcg_stop_wb_wait_on_bdi(bdi);
+ }
}

static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 07e02d6df5ad..095e4dd0427b 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -191,6 +191,8 @@ struct backing_dev_info {
struct rb_root cgwb_congested_tree; /* their congested states */
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
+ struct list_head cgwb_waiters; /* list of all waiters for writeback */
+ spinlock_t cgwb_waiters_lock; /* protect cgwb_waiters list */
#else
struct bdi_writeback_congested *wb_congested;
#endif
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 76c61318fda5..66d7b6901c77 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -56,6 +56,7 @@ struct blkcg {

struct list_head all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_wait_node;
struct list_head cgwb_list;
refcount_t cgwb_refcnt;
#endif
@@ -252,6 +253,12 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
return css ? container_of(css, struct blkcg, css) : NULL;
}

+static inline struct blkcg *blkcg_from_current(void)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return css_to_blkcg(blkcg_css());
+}
+
/**
* __bio_blkcg - internal, inconsistent version to get blkcg
*
@@ -454,6 +461,10 @@ static inline void blkcg_cgwb_put(struct blkcg *blkcg)
blkcg_destroy_blkgs(blkcg);
}

+bool blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi);
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi);
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi);
+
#else

static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
@@ -464,6 +475,14 @@ static inline void blkcg_cgwb_put(struct blkcg *blkcg)
blkcg_destroy_blkgs(blkcg);
}

+static inline bool
+blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ return false;
+}
+static inline void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+static inline void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+
#endif

/**
@@ -772,6 +791,7 @@ static inline void blkcg_bio_issue_init(struct bio *bio)
static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct bio *bio)
{
+ struct backing_dev_info *bdi = q->backing_dev_info;
struct blkcg_gq *blkg;
bool throtl = false;

@@ -788,6 +808,11 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,

blkg = bio->bi_blkg;

+ local_bh_disable();
+ if (blkcg_wb_waiters_on_bdi(blkg->blkcg, bdi))
+ bio_set_flag(bio, BIO_THROTTLED);
+ local_bh_enable();
+
throtl = blk_throtl_bio(q, blkg, bio);

if (!throtl) {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 72e6d0c55cfa..8848d26e8bf6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -686,10 +686,12 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
int ret;

+ INIT_LIST_HEAD(&bdi->cgwb_waiters);
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
+ spin_lock_init(&bdi->cgwb_waiters_lock);

ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
--
2.20.1



2019-03-09 18:54:47

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v3] blkcg: prevent priority inversion problem during sync()

Hi Andrea,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0 next-20190306]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Andrea-Righi/blkcg-prevent-priority-inversion-problem-during-sync/20190310-020543
config: riscv-tinyconfig (attached as .config)
compiler: riscv64-linux-gcc (GCC) 8.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.2.0 make.cross ARCH=riscv

All errors (new ones prefixed by >>):

fs/sync.c: In function 'sync_fs_one_sb':
>> fs/sync.c:83:3: error: implicit declaration of function 'blkcg_start_wb_wait_on_bdi' [-Werror=implicit-function-declaration]
blkcg_start_wb_wait_on_bdi(bdi);
^~~~~~~~~~~~~~~~~~~~~~~~~~
>> fs/sync.c:85:3: error: implicit declaration of function 'blkcg_stop_wb_wait_on_bdi' [-Werror=implicit-function-declaration]
blkcg_stop_wb_wait_on_bdi(bdi);
^~~~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors

vim +/blkcg_start_wb_wait_on_bdi +83 fs/sync.c

77
78 static void sync_fs_one_sb(struct super_block *sb, void *arg)
79 {
80 struct backing_dev_info *bdi = sb->s_bdi;
81
82 if (!sb_rdonly(sb) && sb->s_op->sync_fs) {
> 83 blkcg_start_wb_wait_on_bdi(bdi);
84 sb->s_op->sync_fs(sb, *(int *)arg);
> 85 blkcg_stop_wb_wait_on_bdi(bdi);
86 }
87 }
88

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation


Attachments:
(No filename) (1.81 kB)
.config.gz (4.47 kB)
Download all attachments

2019-03-09 21:47:26

by Andrea Righi

[permalink] [raw]
Subject: [PATCH v4] blkcg: prevent priority inversion problem during sync()

When sync(2) is executed from a high-priority cgroup, the process is
forced to wait the completion of the entire outstanding writeback I/O,
even the I/O that was originally generated by low-priority cgroups
potentially.

This may cause massive latencies to random processes (even those running
in the root cgroup) that shouldn't be I/O-throttled at all, similarly to
a classic priority inversion problem.

Prevent this problem by saving a list of blkcg's that are waiting for
writeback: every time a sync(2) is executed the current blkcg is added
to the list.

Then, when I/O is throttled, if there's a blkcg waiting for writeback
different than the current blkcg, no throttling is applied (we can
probably refine this logic later, i.e., a better policy could be to
adjust the I/O rate using the blkcg with the highest speed from the list
of waiters).

See also:
https://lkml.org/lkml/2019/3/7/640

Signed-off-by: Andrea Righi <[email protected]>
---
Changes in v4:
- fix a build bug when CONFIG_BLOCK is unset

block/blk-cgroup.c | 130 +++++++++++++++++++++++++++++++
block/blk-throttle.c | 11 ++-
fs/fs-writeback.c | 5 ++
fs/sync.c | 8 +-
include/linux/backing-dev-defs.h | 2 +
include/linux/blk-cgroup.h | 35 ++++++++-
mm/backing-dev.c | 2 +
7 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 77f37ef8ef06..5334cb3acd22 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1351,6 +1351,136 @@ struct cgroup_subsys io_cgrp_subsys = {
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

+#ifdef CONFIG_CGROUP_WRITEBACK
+struct blkcg_wb_sleeper {
+ struct blkcg *blkcg;
+ refcount_t refcnt;
+ struct list_head node;
+};
+
+static struct blkcg_wb_sleeper *
+blkcg_wb_sleeper_find(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws;
+
+ list_for_each_entry(bws, &bdi->cgwb_waiters, node)
+ if (bws->blkcg == blkcg)
+ return bws;
+ return NULL;
+}
+
+static void
+blkcg_wb_sleeper_add(struct backing_dev_info *bdi, struct blkcg_wb_sleeper *bws)
+{
+ list_add(&bws->node, &bdi->cgwb_waiters);
+}
+
+static void
+blkcg_wb_sleeper_del(struct backing_dev_info *bdi, struct blkcg_wb_sleeper *bws)
+{
+ list_del_init(&bws->node);
+}
+
+/**
+ * blkcg_wb_waiters_on_bdi - check for writeback waiters on a block device
+ * @blkcg: current blkcg cgroup
+ * @bdi: block device to check
+ *
+ * Return true if any other blkcg different than the current one is waiting for
+ * writeback on the target block device, false otherwise.
+ */
+bool blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws;
+ bool ret = false;
+
+ if (likely(list_empty(&bdi->cgwb_waiters)))
+ return false;
+ spin_lock(&bdi->cgwb_waiters_lock);
+ list_for_each_entry(bws, &bdi->cgwb_waiters, node)
+ if (bws->blkcg != blkcg) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&bdi->cgwb_waiters_lock);
+
+ return ret;
+}
+
+/**
+ * blkcg_start_wb_wait_on_bdi - add current blkcg to writeback waiters list
+ * @bdi: target block device
+ *
+ * Add current blkcg to the list of writeback waiters on target block device.
+ */
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *new_bws, *bws;
+ struct blkcg *blkcg;
+
+ new_bws = kzalloc(sizeof(*new_bws), GFP_KERNEL);
+ if (unlikely(!new_bws))
+ return;
+
+ rcu_read_lock();
+ blkcg = blkcg_from_current();
+ if (likely(blkcg)) {
+ /* Check if blkcg is already sleeping on bdi */
+ spin_lock_bh(&bdi->cgwb_waiters_lock);
+ bws = blkcg_wb_sleeper_find(blkcg, bdi);
+ if (bws) {
+ refcount_inc(&bws->refcnt);
+ } else {
+ /* Add current blkcg as a new wb sleeper on bdi */
+ css_get(&blkcg->css);
+ new_bws->blkcg = blkcg;
+ refcount_set(&new_bws->refcnt, 1);
+ blkcg_wb_sleeper_add(bdi, new_bws);
+ new_bws = NULL;
+ }
+ spin_unlock_bh(&bdi->cgwb_waiters_lock);
+ }
+ rcu_read_unlock();
+
+ kfree(new_bws);
+}
+
+/**
+ * blkcg_stop_wb_wait_on_bdi - remove current blkcg from writeback waiters list
+ * @bdi: target block device
+ *
+ * Remove current blkcg from the list of writeback waiters on target block
+ * device.
+ */
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi)
+{
+ struct blkcg_wb_sleeper *bws = NULL;
+ struct blkcg *blkcg;
+
+ rcu_read_lock();
+ blkcg = blkcg_from_current();
+ if (!blkcg) {
+ rcu_read_unlock();
+ return;
+ }
+ spin_lock_bh(&bdi->cgwb_waiters_lock);
+ bws = blkcg_wb_sleeper_find(blkcg, bdi);
+ if (unlikely(!bws)) {
+ /* blkcg_start/stop_wb_wait_on_bdi() mismatch */
+ WARN_ON(1);
+ goto out_unlock;
+ }
+ if (refcount_dec_and_test(&bws->refcnt)) {
+ blkcg_wb_sleeper_del(bdi, bws);
+ css_put(&blkcg->css);
+ kfree(bws);
+ }
+out_unlock:
+ spin_unlock_bh(&bdi->cgwb_waiters_lock);
+ rcu_read_unlock();
+}
+#endif
+
/**
* blkcg_activate_policy - activate a blkcg policy on a request_queue
* @q: request_queue of interest
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1b97a73d2fb1..da817896cded 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -970,9 +970,13 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+ struct throtl_data *td = tg->td;
+ struct request_queue *q = td->queue;
+ struct backing_dev_info *bdi = q->backing_dev_info;
+ struct blkcg_gq *blkg = tg_to_blkg(tg);

/*
- * Currently whole state machine of group depends on first bio
+ * Currently whole state machine of group depends on first bio
* queued in the group bio list. So one should not be calling
* this function with a different bio if there are other bios
* queued.
@@ -981,8 +985,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

/* If tg->bps = -1, then BW is unlimited */
- if (tg_bps_limit(tg, rw) == U64_MAX &&
- tg_iops_limit(tg, rw) == UINT_MAX) {
+ if (blkcg_wb_waiters_on_bdi(blkg->blkcg, bdi) ||
+ (tg_bps_limit(tg, rw) == U64_MAX &&
+ tg_iops_limit(tg, rw) == UINT_MAX)) {
if (wait)
*wait = 0;
return true;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 36855c1f8daf..77c039a0ec25 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -28,6 +28,7 @@
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"

/*
@@ -2446,6 +2447,8 @@ void sync_inodes_sb(struct super_block *sb)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));

+ blkcg_start_wb_wait_on_bdi(bdi);
+
/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
@@ -2453,6 +2456,8 @@ void sync_inodes_sb(struct super_block *sb)
bdi_up_write_wb_switch_rwsem(bdi);

wait_sb_inodes(sb);
+
+ blkcg_stop_wb_wait_on_bdi(bdi);
}
EXPORT_SYMBOL(sync_inodes_sb);

diff --git a/fs/sync.c b/fs/sync.c
index b54e0541ad89..3958b8f98b85 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
#include "internal.h"

#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -76,8 +77,13 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg)

static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
- if (!sb_rdonly(sb) && sb->s_op->sync_fs)
+ struct backing_dev_info *bdi = sb->s_bdi;
+
+ if (!sb_rdonly(sb) && sb->s_op->sync_fs) {
+ blkcg_start_wb_wait_on_bdi(bdi);
sb->s_op->sync_fs(sb, *(int *)arg);
+ blkcg_stop_wb_wait_on_bdi(bdi);
+ }
}

static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 07e02d6df5ad..095e4dd0427b 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -191,6 +191,8 @@ struct backing_dev_info {
struct rb_root cgwb_congested_tree; /* their congested states */
struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
+ struct list_head cgwb_waiters; /* list of all waiters for writeback */
+ spinlock_t cgwb_waiters_lock; /* protect cgwb_waiters list */
#else
struct bdi_writeback_congested *wb_congested;
#endif
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 76c61318fda5..05e774d55624 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -29,6 +29,27 @@
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX

+struct blkcg;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+bool blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi);
+void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi);
+void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi);
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool
+blkcg_wb_waiters_on_bdi(struct blkcg *blkcg, struct backing_dev_info *bdi)
+{
+ return false;
+}
+
+static inline void blkcg_start_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+static inline void blkcg_stop_wb_wait_on_bdi(struct backing_dev_info *bdi) { }
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+
#ifdef CONFIG_BLK_CGROUP

enum blkg_rwstat_type {
@@ -56,6 +77,7 @@ struct blkcg {

struct list_head all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_wait_node;
struct list_head cgwb_list;
refcount_t cgwb_refcnt;
#endif
@@ -252,6 +274,12 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
return css ? container_of(css, struct blkcg, css) : NULL;
}

+static inline struct blkcg *blkcg_from_current(void)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return css_to_blkcg(blkcg_css());
+}
+
/**
* __bio_blkcg - internal, inconsistent version to get blkcg
*
@@ -463,7 +491,6 @@ static inline void blkcg_cgwb_put(struct blkcg *blkcg)
/* wb isn't being accounted, so trigger destruction right away */
blkcg_destroy_blkgs(blkcg);
}
-
#endif

/**
@@ -772,6 +799,7 @@ static inline void blkcg_bio_issue_init(struct bio *bio)
static inline bool blkcg_bio_issue_check(struct request_queue *q,
struct bio *bio)
{
+ struct backing_dev_info *bdi = q->backing_dev_info;
struct blkcg_gq *blkg;
bool throtl = false;

@@ -788,6 +816,11 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,

blkg = bio->bi_blkg;

+ local_bh_disable();
+ if (blkcg_wb_waiters_on_bdi(blkg->blkcg, bdi))
+ bio_set_flag(bio, BIO_THROTTLED);
+ local_bh_enable();
+
throtl = blk_throtl_bio(q, blkg, bio);

if (!throtl) {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 72e6d0c55cfa..8848d26e8bf6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -686,10 +686,12 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
int ret;

+ INIT_LIST_HEAD(&bdi->cgwb_waiters);
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
+ spin_lock_init(&bdi->cgwb_waiters_lock);

ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
--
2.19.1


2019-03-10 02:08:02

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v3] blkcg: prevent priority inversion problem during sync()

Hi Andrea,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0 next-20190306]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Andrea-Righi/blkcg-prevent-priority-inversion-problem-during-sync/20190310-020543
config: i386-randconfig-s1-201910 (attached as .config)
compiler: gcc-6 (Debian 6.5.0-2) 6.5.0 20181026
reproduce:
# save the attached .config to linux build tree
make ARCH=i386

All errors (new ones prefixed by >>):

fs/fs-writeback.c: In function 'sync_inodes_sb':
>> fs/fs-writeback.c:2450:2: error: implicit declaration of function 'blkcg_start_wb_wait_on_bdi' [-Werror=implicit-function-declaration]
blkcg_start_wb_wait_on_bdi(bdi);
^~~~~~~~~~~~~~~~~~~~~~~~~~
>> fs/fs-writeback.c:2460:2: error: implicit declaration of function 'blkcg_stop_wb_wait_on_bdi' [-Werror=implicit-function-declaration]
blkcg_stop_wb_wait_on_bdi(bdi);
^~~~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors

vim +/blkcg_start_wb_wait_on_bdi +2450 fs/fs-writeback.c

2419
2420 /**
2421 * sync_inodes_sb - sync sb inode pages
2422 * @sb: the superblock
2423 *
2424 * This function writes and waits on any dirty inode belonging to this
2425 * super_block.
2426 */
2427 void sync_inodes_sb(struct super_block *sb)
2428 {
2429 DEFINE_WB_COMPLETION_ONSTACK(done);
2430 struct wb_writeback_work work = {
2431 .sb = sb,
2432 .sync_mode = WB_SYNC_ALL,
2433 .nr_pages = LONG_MAX,
2434 .range_cyclic = 0,
2435 .done = &done,
2436 .reason = WB_REASON_SYNC,
2437 .for_sync = 1,
2438 };
2439 struct backing_dev_info *bdi = sb->s_bdi;
2440
2441 /*
2442 * Can't skip on !bdi_has_dirty() because we should wait for !dirty
2443 * inodes under writeback and I_DIRTY_TIME inodes ignored by
2444 * bdi_has_dirty() need to be written out too.
2445 */
2446 if (bdi == &noop_backing_dev_info)
2447 return;
2448 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2449
> 2450 blkcg_start_wb_wait_on_bdi(bdi);
2451
2452 /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
2453 bdi_down_write_wb_switch_rwsem(bdi);
2454 bdi_split_work_to_wbs(bdi, &work, false);
2455 wb_wait_for_completion(bdi, &done);
2456 bdi_up_write_wb_switch_rwsem(bdi);
2457
2458 wait_sb_inodes(sb);
2459
> 2460 blkcg_stop_wb_wait_on_bdi(bdi);
2461 }
2462 EXPORT_SYMBOL(sync_inodes_sb);
2463

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation


Attachments:
(No filename) (2.78 kB)
.config.gz (30.89 kB)
Download all attachments