From: dongdong tao <[email protected]>
The current way to calculate the writeback rate only considers dirty
sectors. This usually works fine when fragmentation is not high, but it
yields an unreasonably small rate when very few dirty sectors are spread
over a large number of dirty buckets. In some cases the dirty buckets
can reach CUTOFF_WRITEBACK_SYNC while the dirty data (sectors) has not
even reached writeback_percent; the writeback rate then stays at the
minimum value (4k), so all writes get stuck in non-writeback mode
because of the slow writeback.
This patch accelerates the rate in 3 stages with different
aggressiveness: the first stage starts when the dirty bucket percentage
rises above BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW (50), the second at
BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID (57), and the third at
BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH (64). By default the first stage
tries to write back the average amount of dirty data in one bucket in
(1 / (dirty_buckets_percent - 50)) seconds, the second stage in
(1 / (dirty_buckets_percent - 57)) * 200 milliseconds, and the third
stage in (1 / (dirty_buckets_percent - 64)) * 20 milliseconds.
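A worked example with the default terms (numbers chosen purely for
illustration): with 1024-sector buckets, 60% of buckets dirty and an
average of ~200 dirty sectors per bucket, fragment = 1024 / 200 ≈ 5
(> 3, so the override applies) and the mid stage is used, giving
fp_term = 5 * (60 - 57) = 15 and a proportional hint of
200 * 15 = 3000 sectors/s, i.e. one bucket's worth of dirty data is
written back roughly every 1/15 second.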
Signed-off-by: dongdong tao <[email protected]>
---
drivers/md/bcache/bcache.h | 3 +++
drivers/md/bcache/sysfs.c | 18 ++++++++++++++++
drivers/md/bcache/writeback.c | 39 +++++++++++++++++++++++++++++++++++
drivers/md/bcache/writeback.h | 4 ++++
4 files changed, 64 insertions(+)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1d57f48307e6..f8e892208bae 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -385,6 +385,9 @@ struct cached_dev {
unsigned int writeback_rate_update_seconds;
unsigned int writeback_rate_i_term_inverse;
unsigned int writeback_rate_p_term_inverse;
+ unsigned int writeback_rate_fp_term_low;
+ unsigned int writeback_rate_fp_term_mid;
+ unsigned int writeback_rate_fp_term_high;
unsigned int writeback_rate_minimum;
enum stop_on_failure stop_when_cache_set_failed;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 554e3afc9b68..130df9406171 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -121,6 +121,9 @@ rw_attribute(writeback_rate);
rw_attribute(writeback_rate_update_seconds);
rw_attribute(writeback_rate_i_term_inverse);
rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_fp_term_low);
+rw_attribute(writeback_rate_fp_term_mid);
+rw_attribute(writeback_rate_fp_term_high);
rw_attribute(writeback_rate_minimum);
read_attribute(writeback_rate_debug);
@@ -205,6 +208,9 @@ SHOW(__bch_cached_dev)
var_print(writeback_rate_update_seconds);
var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_fp_term_low);
+ var_print(writeback_rate_fp_term_mid);
+ var_print(writeback_rate_fp_term_high);
var_print(writeback_rate_minimum);
if (attr == &sysfs_writeback_rate_debug) {
@@ -331,6 +337,15 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
dc->writeback_rate_p_term_inverse,
1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_low,
+ dc->writeback_rate_fp_term_low,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_mid,
+ dc->writeback_rate_fp_term_mid,
+ 1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_high,
+ dc->writeback_rate_fp_term_high,
+ 1, UINT_MAX);
sysfs_strtoul_clamp(writeback_rate_minimum,
dc->writeback_rate_minimum,
1, UINT_MAX);
@@ -502,6 +517,9 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_update_seconds,
&sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
+ &sysfs_writeback_rate_fp_term_low,
+ &sysfs_writeback_rate_fp_term_mid,
+ &sysfs_writeback_rate_fp_term_high,
&sysfs_writeback_rate_minimum,
&sysfs_writeback_rate_debug,
&sysfs_io_errors,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index a129e4d2707c..a21485448e8e 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -88,6 +88,42 @@ static void __update_writeback_rate(struct cached_dev *dc)
int64_t integral_scaled;
uint32_t new_rate;
+	/*
+	 * We need to consider the number of dirty buckets as well
+	 * when calculating proportional_scaled, otherwise we might
+	 * end up with an unreasonably small writeback rate in a highly
+	 * fragmented situation where very few dirty sectors consume a
+	 * lot of dirty buckets. The worst case is when dirty_data has
+	 * not reached writeback_percent but the dirty buckets have
+	 * reached cutoff_writeback_sync; the rate then stays at the
+	 * minimum, causing writes to get stuck in non-writeback mode.
+	 */
+ struct cache_set *c = dc->disk.c;
+
+ int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
+
+ if (c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
+ int64_t fragment = (dirty_buckets * c->cache->sb.bucket_size) / dirty;
+ int64_t fp_term;
+ int64_t fps;
+
+ if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
+ fp_term = dc->writeback_rate_fp_term_low *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
+ } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
+ fp_term = dc->writeback_rate_fp_term_mid *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
+ } else {
+ fp_term = dc->writeback_rate_fp_term_high *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
+ }
+ fps = (dirty / dirty_buckets) * fp_term;
+ if (fragment > 3 && fps > proportional_scaled) {
+			/* Only override the proportional term when fragment > 3 */
+ proportional_scaled = fps;
+ }
+ }
+
if ((error < 0 && dc->writeback_rate_integral > 0) ||
(error > 0 && time_before64(local_clock(),
dc->writeback_rate.next + NSEC_PER_MSEC))) {
@@ -984,6 +1020,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_fp_term_low = 1;
+ dc->writeback_rate_fp_term_mid = 5;
+ dc->writeback_rate_fp_term_high = 50;
dc->writeback_rate_i_term_inverse = 10000;
WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 3f1230e22de0..02b2f9df73f6 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -16,6 +16,10 @@
#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64
+
#define BCH_DIRTY_INIT_THRD_MAX 64
/*
* 14 (16384ths) is chosen here as something that each backing device
--
2.17.1
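The three new terms are exposed through the cached device's sysfs
directory alongside the existing writeback_rate_* knobs. A quick sketch
of inspecting and tuning them (assuming the cached device shows up as
bcache0, as in the test script later in this thread):
---
# Read the per-stage terms added by this patch.
cat /sys/block/bcache0/bcache/writeback_rate_fp_term_low
cat /sys/block/bcache0/bcache/writeback_rate_fp_term_mid
cat /sys/block/bcache0/bcache/writeback_rate_fp_term_high
# Make the mid stage more aggressive than its default of 5 (as root).
echo 10 > /sys/block/bcache0/bcache/writeback_rate_fp_term_mid
---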
On 1/5/21 11:44 AM, Dongdong Tao wrote:
> Hey Coly,
>
> This is the second version of the patch, please allow me to explain a
> bit for this patch:
>
> We accelerate the rate in 3 stages with different aggressiveness, the
> first stage starts when dirty buckets percent reach above
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
> BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
> tries to writeback the amount of dirty data in one bucket (on average)
> in (1 / (dirty_buckets_percent - 50)) second, the second stage tries to
> writeback the amount of dirty data in one bucket in (1 /
> (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
> to writeback the amount of dirty data in one bucket in (1 /
> (dirty_buckets_percent - 64)) * 20 millisecond.
>
> As we can see, there are two writeback aggressiveness increasing
> strategies, one strategy is with the increasing of the stage, the first
> stage is the easy-going phase whose initial rate is trying to write back
> dirty data of one bucket in 1 second, the second stage is a bit more
> aggressive, the initial rate tries to writeback the dirty data of one
> bucket in 200 ms, the last stage is even more, whose initial rate tries
> to writeback the dirty data of one bucket in 20 ms. This makes sense,
> one reason is that if the preceding stage couldn’t get the fragmentation
> to a fine stage, then the next stage should increase the aggressiveness
> properly, also it is because the later stage is closer to the
> bch_cutoff_writeback_sync. Another aggressiveness increasing strategy is
> with the increasing of dirty bucket percent within each stage, the first
> strategy controls the initial writeback rate of each stage, while this
> one increases the rate based on the initial rate, which is initial_rate
> * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
>
> The initial rate can be controlled by 3 parameters
> writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
> writeback_rate_fp_term_high, they are default 1, 5, 50, users can adjust
> them based on their needs.
>
> The reason that I choose 50, 57, 64 as the threshold value is because
> the GC must be triggered at least once during each stage due to the
> “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache size. So,
> the hope is that the first and second stage can get us back to good
> shape in most situations by smoothly writing back the dirty data without
> giving too much stress to the backing devices, but it might still enter
> the third stage if the bucket consumption is very aggressive.
>
> This patch uses (dirty / dirty_buckets) * fp_term to calculate the rate,
> this formula means that we want to writeback (dirty / dirty_buckets) in
> 1/fp_term second, fp_term is calculated by above aggressiveness
> controller, “dirty” is the current dirty sectors, “dirty_buckets” is the
> current dirty buckets, so (dirty / dirty_buckets) means the average
> dirty sectors in one bucket, the value is between 0 to 1024 for the
> default setting, so this formula basically gives a hint that to reclaim
> one bucket in 1/fp_term second. By using this semantic, we can have a
> lower writeback rate when the amount of dirty data is decreasing and
> overcome the fact that dirty buckets number is always increasing unless
> GC happens.
>
> Compared to the first patch:
> The first patch tries to write back all the data in 40 seconds,
> this will result in a very high writeback rate when the amount of dirty
> data is big, this is mostly true for the large cache devices. The basic
> problem is that the semantic of this patch is not ideal, because we
> don’t really need to writeback all dirty data in order to solve this
> issue, and the instant large increase of the rate is something I feel we
> should better avoid (I like things to be smoothly changed unless no
> choice: )).
>
> Before I get to this new patch(which I believe should be optimal for me
> atm), there have been many tuning/testing iterations, eg. I’ve tried to
> tune the algorithm to writeback ⅓ of the dirty data in a certain amount
> of seconds, writeback 1/fragment of the dirty data in a certain amount
> of seconds, writeback all the dirty data only in those error_buckets
> (error buckets = dirty buckets - 50% of the total buckets) in a certain
> amount of time. However, those all turn out not to be ideal, only the
> semantic of the patch makes much sense for me and allows me to control
> the rate in a more precise way.
>
> Testing data:
> I'll provide the visualized testing data in the next couple of days
> with 1TB NVME devices cache but with HDD as backing device since it's
> what we mostly used in production env.
> I have the data for 400GB NVME, let me prepare it and take it for you to
> review.
[snipped]
Hi Dongdong,
Thanks for the update and continuous effort on this idea.
Please keep in mind that the writeback rate is just an advisory rate for
the writeback throughput; in real workloads, changing the writeback rate
number does not necessarily change the writeback throughput noticeably.
Currently I feel your patch is an interesting and promising idea, but I
am not able to say whether it will take effect in real workloads, so we
do need convincing performance data on real workloads and configurations.
Of course I may also help with the benchmark, but my to-do list is long
enough and it may introduce a very long delay.
Thanks.
Coly Li
On 1/7/21 10:55 PM, Dongdong Tao wrote:
> Hi Coly,
>
>
> Thanks for the reminder. I understand that the rate is only a hint of
> the throughput; it's a value used to calculate the sleep time between
> each round of keys writeback. The higher the rate, the shorter the sleep
> time, which most of the time means more dirty keys can be written back
> in a certain amount of time before the hard disk runs out of speed.
>
>
> Here is the testing data that run on a 400GB NVME + 1TB NVME HDD
>
Hi Dongdong,
Nice charts :-)
> Steps:
>
> 1.
>
> make-bcache -B <HDD> -C <NVME> --writeback
>
> 2.
>
> sudo fio --name=random-writers --filename=/dev/bcache0
> --ioengine=libaio --iodepth=1 --rw=randrw --blocksize=64k,8k
> --direct=1 --numjobs=1 --write_lat_log=mix --log_avg_msec=10
> > The fio benchmark commands ran for about 20 hours.
>
The time lengths of the first 3 charts are 7.000e+7, the rest are
1.60930e+9. I guess the time length of the I/O latency chart is 1/100 of
the rest. Can you also post the latency charts for 1.60930e+9 seconds?
Then I can compare the latency with the dirty data and available cache
charts.
Thanks.
Coly Li
>
> Let’s have a look at the write latency first:
>
> Master:
>
>
>
> Master+the patch:
>
> Combine them together:
>
> Again, the latency (y-axis) is in nanoseconds and the x-axis is the
> timestamp in milliseconds. As we can see, the master latency is
> obviously much higher than the one with my patch once the master bcache
> hits the cutoff writeback sync, and the master isn't going to get out of
> this situation: the graph shows it had already been stuck at the cutoff
> writeback sync for about 4 hours before I finished the testing, and it
> may still need to stay stuck for days before it can get out of this
> situation by itself.
>
>
> Note that there are 1 million points for each; red represents master,
> green represents master+my patch. Most of them overlap with each
> other, so it may look like this graph has more red points than green
> after it hits the cutoff, but that is simply because the latency has
> scaled to a bigger range, which represents the HDD latency.
>
>
>
> Let’s also have a look at the bcache’s cache available percent and dirty
> data percent.
>
> Master:
>
> Master+this patch:
>
> As you can see, this patch can avoid it hitting the cutoff writeback sync.
>
>
> As to say the improvement for this patch against the first one, let’s
> take a look at the writeback rate changing during the run.
>
> patch V1:
>
>
>
> Patch V2:
>
>
> The Y-axis is the value of the rate. V1 is very aggressive, as it jumps
> instantly from the minimum of 8 to around 10 million, while patch V2
> keeps the rate under 5000 during the run, and after the first round of
> writeback it can stay even under 2500. This proves we don't need to be
> as aggressive as V1 to get out of the high-fragmentation situation which
> eventually causes all writes to hit the backing device. This looks very
> reasonable to me now.
>
> Note that the fio command that I used consumes buckets quite
> aggressively, so it had to hit the third stage, which has the highest
> aggressiveness, but I believe this is not true in a real production env:
> a real production env won't consume buckets that aggressively, so I
> expect stage 3 will rarely need to be hit.
>
>
> As discussed, I'll run multiple block size testing on at least 1TB NVME
> device later.
> But it might take some time.
>
>
> Regards,
> Dongdong
>
> On Tue, Jan 5, 2021 at 12:33 PM Coly Li <[email protected]
> <mailto:[email protected]>> wrote:
>
> On 1/5/21 11:44 AM, Dongdong Tao wrote:
> > Hey Coly,
> >
> > This is the second version of the patch, please allow me to explain a
> > bit for this patch:
> >
> > We accelerate the rate in 3 stages with different aggressiveness, the
> > first stage starts when dirty buckets percent reach above
> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
> > tries to writeback the amount of dirty data in one bucket (on average)
> > in (1 / (dirty_buckets_percent - 50)) second, the second stage
> tries to
> > writeback the amount of dirty data in one bucket in (1 /
> > (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
> > to writeback the amount of dirty data in one bucket in (1 /
> > (dirty_buckets_percent - 64)) * 20 millisecond.
> >
> > As we can see, there are two writeback aggressiveness increasing
> > strategies, one strategy is with the increasing of the stage, the
> first
> > stage is the easy-going phase whose initial rate is trying to
> write back
> > dirty data of one bucket in 1 second, the second stage is a bit more
> > aggressive, the initial rate tries to writeback the dirty data of one
> > bucket in 200 ms, the last stage is even more, whose initial rate
> tries
> > to writeback the dirty data of one bucket in 20 ms. This makes sense,
> > one reason is that if the preceding stage couldn’t get the
> fragmentation
> > to a fine stage, then the next stage should increase the
> aggressiveness
> > properly, also it is because the later stage is closer to the
> > bch_cutoff_writeback_sync. Another aggressiveness increasing
> strategy is
> > with the increasing of dirty bucket percent within each stage, the
> first
> > strategy controls the initial writeback rate of each stage, while this
> > one increases the rate based on the initial rate, which is
> initial_rate
> > * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
> >
> > The initial rate can be controlled by 3 parameters
> > writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
> > writeback_rate_fp_term_high, they are default 1, 5, 50, users can
> adjust
> > them based on their needs.
> >
> > The reason that I choose 50, 57, 64 as the threshold value is because
> > the GC must be triggered at least once during each stage due to the
> > “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache
> size. So,
> > the hope is that the first and second stage can get us back to good
> > shape in most situations by smoothly writing back the dirty data
> without
> > giving too much stress to the backing devices, but it might still
> enter
> > the third stage if the bucket consumption is very aggressive.
> >
> > This patch use (dirty / dirty_buckets) * fp_term to calculate the
> rate,
> > this formula means that we want to writeback (dirty /
> dirty_buckets) in
> > 1/fp_term second, fp_term is calculated by above aggressiveness
> > controller, “dirty” is the current dirty sectors, “dirty_buckets”
> is the
> > current dirty buckets, so (dirty / dirty_buckets) means the average
> > dirty sectors in one bucket, the value is between 0 to 1024 for the
> > default setting, so this formula basically gives a hint that to
> reclaim
> > one bucket in 1/fp_term second. By using this semantic, we can have a
> > lower writeback rate when the amount of dirty data is decreasing and
> > overcome the fact that dirty buckets number is always increasing
> unless
> > GC happens.
> >
> > *Compare to the first patch:
> > *The first patch is trying to write back all the data in 40 seconds,
> > this will result in a very high writeback rate when the amount of
> dirty
> > data is big, this is mostly true for the large cache devices. The
> basic
> > problem is that the semantic of this patch is not ideal, because we
> > don’t really need to writeback all dirty data in order to solve this
> > issue, and the instant large increase of the rate is something I
> feel we
> > should better avoid (I like things to be smoothly changed unless no
> > choice: )).
> >
> > Before I get to this new patch(which I believe should be optimal
> for me
> > atm), there have been many tuning/testing iterations, eg. I’ve
> tried to
> > tune the algorithm to writeback ⅓ of the dirty data in a certain
> amount
> > of seconds, writeback 1/fragment of the dirty data in a certain amount
> > of seconds, writeback all the dirty data only in those error_buckets
> > (error buckets = dirty buckets - 50% of the total buckets) in a
> certain
> > amount of time. However, those all turn out not to be ideal, only the
> > semantic of the patch makes much sense for me and allows me to control
> > the rate in a more precise way.
> >
> > *Testing data:
> > *I'll provide the visualized testing data in the next couple of days
> > with 1TB NVME devices cache but with HDD as backing device since it's
> > what we mostly used in production env.
> > I have the data for 400GB NVME, let me prepare it and take it for
> you to
> > review.
> [snipped]
>
> Hi Dongdong,
>
> Thanks for the update and continuous effort on this idea.
>
> Please keep in mind the writeback rate is just a advice rate for the
> writeback throughput, in real workload changing the writeback rate
> number does not change writeback throughput obviously.
>
> Currently I feel this is an interesting and promising idea for your
> patch, but I am not able to say whether it may take effect in real
> workload, so we do need convinced performance data on real workload and
> configuration.
>
> Of course I may also help on the benchmark, but my to-do list is long
> enough and it may take a very long delay time.
>
> Thanks.
>
> Coly Li
>
Hi Coly,
They are captured over the same time length, but the meaning of the
timestamp and the time unit on the x-axis are different.
(Sorry, I should have clarified this right after the charts.)
For the latency chart:
The timestamp is the relative time since the beginning of the
benchmark, so the start timestamp is 0 and the unit is milliseconds.
For the dirty data and cache available percent chart:
The timestamp is the UNIX timestamp and the time unit is seconds.
I capture the stats every 5 seconds with the script below:
---
#!/bin/sh
while true; do
	dirty=`cat /sys/block/bcache0/bcache/dirty_data`
	avail=`cat /sys/block/bcache0/bcache/cache/cache_available_percent`
	rate=`cat /sys/block/bcache0/bcache/writeback_rate`
	echo "`date +%s`, $dirty, $avail, $rate" >> $1
	sleep 5
done
---
Unfortunately, I can't easily make them use the same timestamp, but I
can try to convert the UNIX timestamps to relative time like the first
chart; one possible conversion is sketched below. Even if we ignore the
values on the X-axis, we can still roughly compare the charts by the
length of the X-axis since they cover the same time span, and we can see
that the master's writes start hitting the backing device when
cache_available_percent drops to around 30.
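For instance, a minimal conversion sketch (assuming the stats were
appended to a file named stats.log, a hypothetical name; the capture
script above writes to whatever path is passed as $1):
---
#!/bin/sh
# Rebase the first column (epoch seconds) to seconds since the first
# sample, so the chart can use relative time like the fio latency log.
awk 'BEGIN {FS=OFS=", "} NR==1 {start=$1} {$1=$1-start; print}' \
	stats.log > stats_relative.log
---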
Regards,
Dongdong
On Fri, Jan 8, 2021 at 12:06 PM Coly Li <[email protected]> wrote:
>
> On 1/7/21 10:55 PM, Dongdong Tao wrote:
> > Hi Coly,
> >
> >
> > Thanks for the reminder, I understand that the rate is only a hint of
> > the throughput, it’s a value to calculate the sleep time between each
> > round of keys writeback, the higher the rate, the shorter the sleep
> > time, most of the time this means the more dirty keys it can writeback
> > in a certain amount of time before the hard disk running out of speed.
> >
> >
> > Here is the testing data that run on a 400GB NVME + 1TB NVME HDD
> >
>
> Hi Dongdong,
>
> Nice charts :-)
>
> > Steps:
> >
> > 1.
> >
> > make-bcache -B <HDD> -C <NVME> --writeback
> >
> > 2.
> >
> > sudo fio --name=random-writers --filename=/dev/bcache0
> > --ioengine=libaio --iodepth=1 --rw=randrw --blocksize=64k,8k
> > --direct=1 --numjobs=1 --write_lat_log=mix --log_avg_msec=10
> > > The fio benchmark commands ran for about 20 hours.
> >
>
> The time lengths of first 3 charts are 7.000e+7, rested are 1.60930e+9.
> I guess the time length of the I/O latency chart is 1/100 of the rested.
>
> Can you also post the latency charts for 1.60930e+9 seconds? Then I can
> compare the latency with dirty data and available cache charts.
>
>
> Thanks.
>
>
> Coly Li
>
>
>
>
>
> >
> > Let’s have a look at the write latency first:
> >
> > Master:
> >
> >
> >
> > Master+the patch:
> >
> > Combine them together:
> >
> > Again, the latency (y-axis) is based on nano-second, x-axis is the
> > timestamp based on milli-second, as we can see the master latency is
> > obviously much higher than the one with my patch when the master bcache
> > hit the cutoff writeback sync, the master isn’t going to get out of this
> > cutoff writeback sync situation, This graph showed it already stuck at
> > the cutoff writeback sync for about 4 hours before I finish the testing,
> > it may still needs to stuck for days before it can get out this
> > situation itself.
> >
> >
> > Note that there are 1 million points for each , red represents master,
> > green represents mater+my patch. Most of them are overlapped with each
> > other, so it may look like this graph has more red points then green
> > after it hitting the cutoff, but simply it’s because the latency has
> > scaled to a bigger range which represents the HDD latency.
> >
> >
> >
> > Let’s also have a look at the bcache’s cache available percent and dirty
> > data percent.
> >
> > Master:
> >
> > Master+this patch:
> >
> > As you can see, this patch can avoid it hitting the cutoff writeback sync.
> >
> >
> > As to say the improvement for this patch against the first one, let’s
> > take a look at the writeback rate changing during the run.
> >
> > patch V1:
> >
> >
> >
> > Patch V2:
> >
> >
> > The Y-axis is the value of rate, the V1 is very aggressive as it jumps
> > instantly from a minimum 8 to around 10 million. And the patch V2 can
> > control the rate under 5000 during the run, and after the first round of
> > writeback, it can stay even under 2500, so this proves we don’t need to
> > be as aggressive as V1 to get out of the high fragment situation which
> > eventually causes all writes hitting the backing device. This looks very
> > reasonable for me now.
> >
> > Note that the fio command that I used is consuming the bucket quite
> > aggressively, so it had to hit the third stage which has the highest
> > aggressiveness, but I believe this is not true in a real production env,
> > real production env won’t consume buckets that aggressively, so I expect
> > stage 3 may not very often be needed to hit.
> >
> >
> > As discussed, I'll run multiple block size testing on at least 1TB NVME
> > device later.
> > But it might take some time.
> >
> >
> > Regards,
> > Dongdong
> >
> > On Tue, Jan 5, 2021 at 12:33 PM Coly Li <[email protected]
> > <mailto:[email protected]>> wrote:
> >
> > On 1/5/21 11:44 AM, Dongdong Tao wrote:
> > > Hey Coly,
> > >
> > > This is the second version of the patch, please allow me to explain a
> > > bit for this patch:
> > >
> > > We accelerate the rate in 3 stages with different aggressiveness, the
> > > first stage starts when dirty buckets percent reach above
> > > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
> > > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
> > > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
> > > tries to writeback the amount of dirty data in one bucket (on average)
> > > in (1 / (dirty_buckets_percent - 50)) second, the second stage
> > tries to
> > > writeback the amount of dirty data in one bucket in (1 /
> > > (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
> > > to writeback the amount of dirty data in one bucket in (1 /
> > > (dirty_buckets_percent - 64)) * 20 millisecond.
> > >
> > > As we can see, there are two writeback aggressiveness increasing
> > > strategies, one strategy is with the increasing of the stage, the
> > first
> > > stage is the easy-going phase whose initial rate is trying to
> > write back
> > > dirty data of one bucket in 1 second, the second stage is a bit more
> > > aggressive, the initial rate tries to writeback the dirty data of one
> > > bucket in 200 ms, the last stage is even more, whose initial rate
> > tries
> > > to writeback the dirty data of one bucket in 20 ms. This makes sense,
> > > one reason is that if the preceding stage couldn’t get the
> > fragmentation
> > > to a fine stage, then the next stage should increase the
> > aggressiveness
> > > properly, also it is because the later stage is closer to the
> > > bch_cutoff_writeback_sync. Another aggressiveness increasing
> > strategy is
> > > with the increasing of dirty bucket percent within each stage, the
> > first
> > > strategy controls the initial writeback rate of each stage, while this
> > > one increases the rate based on the initial rate, which is
> > initial_rate
> > > * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
> > >
> > > The initial rate can be controlled by 3 parameters
> > > writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
> > > writeback_rate_fp_term_high, they are default 1, 5, 50, users can
> > adjust
> > > them based on their needs.
> > >
> > > The reason that I choose 50, 57, 64 as the threshold value is because
> > > the GC must be triggered at least once during each stage due to the
> > > “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache
> > size. So,
> > > the hope is that the first and second stage can get us back to good
> > > shape in most situations by smoothly writing back the dirty data
> > without
> > > giving too much stress to the backing devices, but it might still
> > enter
> > > the third stage if the bucket consumption is very aggressive.
> > >
> > > This patch use (dirty / dirty_buckets) * fp_term to calculate the
> > rate,
> > > this formula means that we want to writeback (dirty /
> > dirty_buckets) in
> > > 1/fp_term second, fp_term is calculated by above aggressiveness
> > > controller, “dirty” is the current dirty sectors, “dirty_buckets”
> > is the
> > > current dirty buckets, so (dirty / dirty_buckets) means the average
> > > dirty sectors in one bucket, the value is between 0 to 1024 for the
> > > default setting, so this formula basically gives a hint that to
> > reclaim
> > > one bucket in 1/fp_term second. By using this semantic, we can have a
> > > lower writeback rate when the amount of dirty data is decreasing and
> > > overcome the fact that dirty buckets number is always increasing
> > unless
> > > GC happens.
> > >
> > > *Compare to the first patch:
> > > *The first patch is trying to write back all the data in 40 seconds,
> > > this will result in a very high writeback rate when the amount of
> > dirty
> > > data is big, this is mostly true for the large cache devices. The
> > basic
> > > problem is that the semantic of this patch is not ideal, because we
> > > don’t really need to writeback all dirty data in order to solve this
> > > issue, and the instant large increase of the rate is something I
> > feel we
> > > should better avoid (I like things to be smoothly changed unless no
> > > choice: )).
> > >
> > > Before I get to this new patch(which I believe should be optimal
> > for me
> > > atm), there have been many tuning/testing iterations, eg. I’ve
> > tried to
> > > tune the algorithm to writeback ⅓ of the dirty data in a certain
> > amount
> > > of seconds, writeback 1/fragment of the dirty data in a certain amount
> > > of seconds, writeback all the dirty data only in those error_buckets
> > > (error buckets = dirty buckets - 50% of the total buckets) in a
> > certain
> > > amount of time. However, those all turn out not to be ideal, only the
> > > semantic of the patch makes much sense for me and allows me to control
> > > the rate in a more precise way.
> > >
> > > *Testing data:
> > > *I'll provide the visualized testing data in the next couple of days
> > > with 1TB NVME devices cache but with HDD as backing device since it's
> > > what we mostly used in production env.
> > > I have the data for 400GB NVME, let me prepare it and take it for
> > you to
> > > review.
> > [snipped]
> >
> > Hi Dongdong,
> >
> > Thanks for the update and continuous effort on this idea.
> >
> > Please keep in mind the writeback rate is just a advice rate for the
> > writeback throughput, in real workload changing the writeback rate
> > number does not change writeback throughput obviously.
> >
> > Currently I feel this is an interesting and promising idea for your
> > patch, but I am not able to say whether it may take effect in real
> > workload, so we do need convinced performance data on real workload and
> > configuration.
> >
> > Of course I may also help on the benchmark, but my to-do list is long
> > enough and it may take a very long delay time.
> >
> > Thanks.
> >
> > Coly Li
> >
>
On 1/8/21 4:30 PM, Dongdong Tao wrote:
> Hi Coly,
>
> They are captured with the same time length, the meaning of the
> timestamp and the time unit on the x-axis are different.
> (Sorry, I should have clarified this right after the chart)
>
> For the latency chart:
> The timestamp is the relative time since the beginning of the
> benchmark, so the start timestamp is 0 and the unit is based on
> millisecond
>
> For the dirty data and cache available percent chart:
> The timestamp is the UNIX timestamp, the time unit is based on second,
> I capture the stats every 5 seconds with the below script:
> ---
> #!/bin/sh
> while true; do echo "`date +%s`, `cat
> /sys/block/bcache0/bcache/dirty_data`, `cat
> /sys/block/bcache0/bcache/cache/cache_available_percent`, `cat
> /sys/block/bcache0/bcache/writeback_rate`" >> $1; sleep 5; done;
> ---
>
> Unfortunately, I can't easily make them using the same timestamp, but
> I guess I can try to convert the UNIX timestamp to the relative time
> like the first one.
> But If we ignore the value of the X-axis, we can still roughly
> compare them by using the length of the X-axis since they have the
> same time length,
> and we can see that the Master's write start hitting the backing
> device when the cache_available_percent dropped to around 30.
Copied, thanks for the explanation. The chart for a single thread with
io depth 1 is convincing IMHO :-)
One more question: the benchmark uses a single I/O thread with io depth
1, which is not a typical condition for real workloads. Do you have a
plan to test the latency and IOPS for multiple threads with a larger I/O
depth?
Thanks.
Coly Li
>
> On Fri, Jan 8, 2021 at 12:06 PM Coly Li <[email protected]> wrote:
>>
>> On 1/7/21 10:55 PM, Dongdong Tao wrote:
>>> Hi Coly,
>>>
>>>
>>> Thanks for the reminder, I understand that the rate is only a hint of
>>> the throughput, it’s a value to calculate the sleep time between each
>>> round of keys writeback, the higher the rate, the shorter the sleep
>>> time, most of the time this means the more dirty keys it can writeback
>>> in a certain amount of time before the hard disk running out of speed.
>>>
>>>
>>> Here is the testing data that run on a 400GB NVME + 1TB NVME HDD
>>>
>>
>> Hi Dongdong,
>>
>> Nice charts :-)
>>
>>> Steps:
>>>
>>> 1.
>>>
>>> make-bcache -B <HDD> -C <NVME> --writeback
>>>
>>> 2.
>>>
>>> sudo fio --name=random-writers --filename=/dev/bcache0
>>> --ioengine=libaio --iodepth=1 --rw=randrw --blocksize=64k,8k
>>> --direct=1 --numjobs=1 --write_lat_log=mix --log_avg_msec=10
>>>> The fio benchmark commands ran for about 20 hours.
>>>
>>
>> The time lengths of first 3 charts are 7.000e+7, rested are 1.60930e+9.
>> I guess the time length of the I/O latency chart is 1/100 of the rested.
>>
>> Can you also post the latency charts for 1.60930e+9 seconds? Then I can
>> compare the latency with dirty data and available cache charts.
>>
>>
>> Thanks.
>>
>>
>> Coly Li
>>
>>
>>
>>
>>
>>>
>>> Let’s have a look at the write latency first:
>>>
>>> Master:
>>>
>>>
>>>
>>> Master+the patch:
>>>
>>> Combine them together:
>>>
>>> Again, the latency (y-axis) is based on nano-second, x-axis is the
>>> timestamp based on milli-second, as we can see the master latency is
>>> obviously much higher than the one with my patch when the master bcache
>>> hit the cutoff writeback sync, the master isn’t going to get out of this
>>> cutoff writeback sync situation, This graph showed it already stuck at
>>> the cutoff writeback sync for about 4 hours before I finish the testing,
>>> it may still needs to stuck for days before it can get out this
>>> situation itself.
>>>
>>>
>>> Note that there are 1 million points for each , red represents master,
>>> green represents mater+my patch. Most of them are overlapped with each
>>> other, so it may look like this graph has more red points then green
>>> after it hitting the cutoff, but simply it’s because the latency has
>>> scaled to a bigger range which represents the HDD latency.
>>>
>>>
>>>
>>> Let’s also have a look at the bcache’s cache available percent and dirty
>>> data percent.
>>>
>>> Master:
>>>
>>> Master+this patch:
>>>
>>> As you can see, this patch can avoid it hitting the cutoff writeback sync.
>>>
>>>
>>> As to say the improvement for this patch against the first one, let’s
>>> take a look at the writeback rate changing during the run.
>>>
>>> patch V1:
>>>
>>>
>>>
>>> Patch V2:
>>>
>>>
>>> The Y-axis is the value of rate, the V1 is very aggressive as it jumps
>>> instantly from a minimum 8 to around 10 million. And the patch V2 can
>>> control the rate under 5000 during the run, and after the first round of
>>> writeback, it can stay even under 2500, so this proves we don’t need to
>>> be as aggressive as V1 to get out of the high fragment situation which
>>> eventually causes all writes hitting the backing device. This looks very
>>> reasonable for me now.
>>>
>>> Note that the fio command that I used is consuming the bucket quite
>>> aggressively, so it had to hit the third stage which has the highest
>>> aggressiveness, but I believe this is not true in a real production env,
>>> real production env won’t consume buckets that aggressively, so I expect
>>> stage 3 may not very often be needed to hit.
>>>
>>>
>>> As discussed, I'll run multiple block size testing on at least 1TB NVME
>>> device later.
>>> But it might take some time.
>>>
>>>
>>> Regards,
>>> Dongdong
>>>
>>> On Tue, Jan 5, 2021 at 12:33 PM Coly Li <[email protected]
>>> <mailto:[email protected]>> wrote:
>>>
>>> On 1/5/21 11:44 AM, Dongdong Tao wrote:
>>> > Hey Coly,
>>> >
>>> > This is the second version of the patch, please allow me to explain a
>>> > bit for this patch:
>>> >
>>> > We accelerate the rate in 3 stages with different aggressiveness, the
>>> > first stage starts when dirty buckets percent reach above
>>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
>>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
>>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
>>> > tries to writeback the amount of dirty data in one bucket (on average)
>>> > in (1 / (dirty_buckets_percent - 50)) second, the second stage
>>> tries to
>>> > writeback the amount of dirty data in one bucket in (1 /
>>> > (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
>>> > to writeback the amount of dirty data in one bucket in (1 /
>>> > (dirty_buckets_percent - 64)) * 20 millisecond.
>>> >
>>> > As we can see, there are two writeback aggressiveness increasing
>>> > strategies, one strategy is with the increasing of the stage, the
>>> first
>>> > stage is the easy-going phase whose initial rate is trying to
>>> write back
>>> > dirty data of one bucket in 1 second, the second stage is a bit more
>>> > aggressive, the initial rate tries to writeback the dirty data of one
>>> > bucket in 200 ms, the last stage is even more, whose initial rate
>>> tries
>>> > to writeback the dirty data of one bucket in 20 ms. This makes sense,
>>> > one reason is that if the preceding stage couldn’t get the
>>> fragmentation
>>> > to a fine stage, then the next stage should increase the
>>> aggressiveness
>>> > properly, also it is because the later stage is closer to the
>>> > bch_cutoff_writeback_sync. Another aggressiveness increasing
>>> strategy is
>>> > with the increasing of dirty bucket percent within each stage, the
>>> first
>>> > strategy controls the initial writeback rate of each stage, while this
>>> > one increases the rate based on the initial rate, which is
>>> initial_rate
>>> > * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
>>> >
>>> > The initial rate can be controlled by 3 parameters
>>> > writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
>>> > writeback_rate_fp_term_high, they are default 1, 5, 50, users can
>>> adjust
>>> > them based on their needs.
>>> >
>>> > The reason that I choose 50, 57, 64 as the threshold value is because
>>> > the GC must be triggered at least once during each stage due to the
>>> > “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache
>>> size. So,
>>> > the hope is that the first and second stage can get us back to good
>>> > shape in most situations by smoothly writing back the dirty data
>>> without
>>> > giving too much stress to the backing devices, but it might still
>>> enter
>>> > the third stage if the bucket consumption is very aggressive.
>>> >
>>> > This patch use (dirty / dirty_buckets) * fp_term to calculate the
>>> rate,
>>> > this formula means that we want to writeback (dirty /
>>> dirty_buckets) in
>>> > 1/fp_term second, fp_term is calculated by above aggressiveness
>>> > controller, “dirty” is the current dirty sectors, “dirty_buckets”
>>> is the
>>> > current dirty buckets, so (dirty / dirty_buckets) means the average
>>> > dirty sectors in one bucket, the value is between 0 to 1024 for the
>>> > default setting, so this formula basically gives a hint that to
>>> reclaim
>>> > one bucket in 1/fp_term second. By using this semantic, we can have a
>>> > lower writeback rate when the amount of dirty data is decreasing and
>>> > overcome the fact that dirty buckets number is always increasing
>>> unless
>>> > GC happens.
>>> >
>>> > *Compare to the first patch:
>>> > *The first patch is trying to write back all the data in 40 seconds,
>>> > this will result in a very high writeback rate when the amount of
>>> dirty
>>> > data is big, this is mostly true for the large cache devices. The
>>> basic
>>> > problem is that the semantic of this patch is not ideal, because we
>>> > don’t really need to writeback all dirty data in order to solve this
>>> > issue, and the instant large increase of the rate is something I
>>> feel we
>>> > should better avoid (I like things to be smoothly changed unless no
>>> > choice: )).
>>> >
>>> > Before I get to this new patch(which I believe should be optimal
>>> for me
>>> > atm), there have been many tuning/testing iterations, eg. I’ve
>>> tried to
>>> > tune the algorithm to writeback ⅓ of the dirty data in a certain
>>> amount
>>> > of seconds, writeback 1/fragment of the dirty data in a certain amount
>>> > of seconds, writeback all the dirty data only in those error_buckets
>>> > (error buckets = dirty buckets - 50% of the total buckets) in a
>>> certain
>>> > amount of time. However, those all turn out not to be ideal, only the
>>> > semantic of the patch makes much sense for me and allows me to control
>>> > the rate in a more precise way.
>>> >
>>> > *Testing data:
>>> > *I'll provide the visualized testing data in the next couple of days
>>> > with 1TB NVME devices cache but with HDD as backing device since it's
>>> > what we mostly used in production env.
>>> > I have the data for 400GB NVME, let me prepare it and take it for
>>> you to
>>> > review.
>>> [snipped]
>>>
>>> Hi Dongdong,
>>>
>>> Thanks for the update and continuous effort on this idea.
>>>
>>> Please keep in mind the writeback rate is just a advice rate for the
>>> writeback throughput, in real workload changing the writeback rate
>>> number does not change writeback throughput obviously.
>>>
>>> Currently I feel this is an interesting and promising idea for your
>>> patch, but I am not able to say whether it may take effect in real
>>> workload, so we do need convinced performance data on real workload and
>>> configuration.
>>>
>>> Of course I may also help on the benchmark, but my to-do list is long
>>> enough and it may take a very long delay time.
>>>
>>> Thanks.
>>>
>>> Coly Li
>>>
>>
Yep, I will scale the testing to multiple threads with a larger IO
depth, thanks for the suggestion! A possible invocation is sketched
below.
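A scaled-up variant of the earlier fio command might look something like
this (the thread count and queue depth here are illustrative, not
necessarily the values that will be used):
---
sudo fio --name=random-writers --filename=/dev/bcache0 \
	--ioengine=libaio --iodepth=32 --rw=randrw --blocksize=64k,8k \
	--direct=1 --numjobs=8 --write_lat_log=mix --log_avg_msec=10
---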
On Fri, Jan 8, 2021 at 4:40 PM Coly Li <[email protected]> wrote:
>
> On 1/8/21 4:30 PM, Dongdong Tao wrote:
> > Hi Coly,
> >
> > They are captured with the same time length, the meaning of the
> > timestamp and the time unit on the x-axis are different.
> > (Sorry, I should have clarified this right after the chart)
> >
> > For the latency chart:
> > The timestamp is the relative time since the beginning of the
> > benchmark, so the start timestamp is 0 and the unit is based on
> > millisecond
> >
> > For the dirty data and cache available percent chart:
> > The timestamp is the UNIX timestamp, the time unit is based on second,
> > I capture the stats every 5 seconds with the below script:
> > ---
> > #!/bin/sh
> > while true; do echo "`date +%s`, `cat
> > /sys/block/bcache0/bcache/dirty_data`, `cat
> > /sys/block/bcache0/bcache/cache/cache_available_percent`, `cat
> > /sys/block/bcache0/bcache/writeback_rate`" >> $1; sleep 5; done;
> > ---
> >
> > Unfortunately, I can't easily make them using the same timestamp, but
> > I guess I can try to convert the UNIX timestamp to the relative time
> > like the first one.
> > But If we ignore the value of the X-axis, we can still roughly
> > compare them by using the length of the X-axis since they have the
> > same time length,
> > and we can see that the Master's write start hitting the backing
> > device when the cache_available_percent dropped to around 30.
>
> Copied, thanks for the explanation. The chart for single thread with io
> depth 1 is convinced IMHO :-)
>
> One more question, the benchmark is about a single I/O thread with io
> depth 1, which is not typical condition for real workload. Do you have
> plan to test the latency and IOPS for multiple threads with larger I/O
> depth ?
>
>
> Thanks.
>
>
> Coly Li
>
>
> >
> > On Fri, Jan 8, 2021 at 12:06 PM Coly Li <[email protected]> wrote:
> >>
> >> On 1/7/21 10:55 PM, Dongdong Tao wrote:
> >>> Hi Coly,
> >>>
> >>>
> >>> Thanks for the reminder, I understand that the rate is only a hint of
> >>> the throughput, it’s a value to calculate the sleep time between each
> >>> round of keys writeback, the higher the rate, the shorter the sleep
> >>> time, most of the time this means the more dirty keys it can writeback
> >>> in a certain amount of time before the hard disk running out of speed.
> >>>
> >>>
> >>> Here is the testing data that run on a 400GB NVME + 1TB NVME HDD
> >>>
> >>
> >> Hi Dongdong,
> >>
> >> Nice charts :-)
> >>
> >>> Steps:
> >>>
> >>> 1.
> >>>
> >>> make-bcache -B <HDD> -C <NVME> --writeback
> >>>
> >>> 2.
> >>>
> >>> sudo fio --name=random-writers --filename=/dev/bcache0
> >>> --ioengine=libaio --iodepth=1 --rw=randrw --blocksize=64k,8k
> >>> --direct=1 --numjobs=1 --write_lat_log=mix --log_avg_msec=10
> >>>> The fio benchmark commands ran for about 20 hours.
> >>>
> >>
> >> The time lengths of first 3 charts are 7.000e+7, rested are 1.60930e+9.
> >> I guess the time length of the I/O latency chart is 1/100 of the rested.
> >>
> >> Can you also post the latency charts for 1.60930e+9 seconds? Then I can
> >> compare the latency with dirty data and available cache charts.
> >>
> >>
> >> Thanks.
> >>
> >>
> >> Coly Li
> >>
> >>
> >>
> >>
> >>
> >>>
> >>> Let’s have a look at the write latency first:
> >>>
> >>> Master:
> >>>
> >>>
> >>>
> >>> Master+the patch:
> >>>
> >>> Combine them together:
> >>>
> >>> Again, the latency (y-axis) is based on nano-second, x-axis is the
> >>> timestamp based on milli-second, as we can see the master latency is
> >>> obviously much higher than the one with my patch when the master bcache
> >>> hit the cutoff writeback sync, the master isn’t going to get out of this
> >>> cutoff writeback sync situation, This graph showed it already stuck at
> >>> the cutoff writeback sync for about 4 hours before I finish the testing,
> >>> it may still needs to stuck for days before it can get out this
> >>> situation itself.
> >>>
> >>>
> >>> Note that there are 1 million points for each , red represents master,
> >>> green represents mater+my patch. Most of them are overlapped with each
> >>> other, so it may look like this graph has more red points then green
> >>> after it hitting the cutoff, but simply it’s because the latency has
> >>> scaled to a bigger range which represents the HDD latency.
> >>>
> >>>
> >>>
> >>> Let’s also have a look at the bcache’s cache available percent and dirty
> >>> data percent.
> >>>
> >>> Master:
> >>>
> >>> Master+this patch:
> >>>
> >>> As you can see, this patch can avoid it hitting the cutoff writeback sync.
> >>>
> >>>
> >>> As to say the improvement for this patch against the first one, let’s
> >>> take a look at the writeback rate changing during the run.
> >>>
> >>> patch V1:
> >>>
> >>>
> >>>
> >>> Patch V2:
> >>>
> >>>
> >>> The Y-axis is the value of rate, the V1 is very aggressive as it jumps
> >>> instantly from a minimum 8 to around 10 million. And the patch V2 can
> >>> control the rate under 5000 during the run, and after the first round of
> >>> writeback, it can stay even under 2500, so this proves we don’t need to
> >>> be as aggressive as V1 to get out of the high fragment situation which
> >>> eventually causes all writes hitting the backing device. This looks very
> >>> reasonable for me now.
> >>>
> >>> Note that the fio command that I used is consuming the bucket quite
> >>> aggressively, so it had to hit the third stage which has the highest
> >>> aggressiveness, but I believe this is not true in a real production env,
> >>> real production env won’t consume buckets that aggressively, so I expect
> >>> stage 3 may not very often be needed to hit.
> >>>
> >>>
> >>> As discussed, I'll run multiple block size testing on at least 1TB NVME
> >>> device later.
> >>> But it might take some time.
> >>>
> >>>
> >>> Regards,
> >>> Dongdong
> >>>
> >>> On Tue, Jan 5, 2021 at 12:33 PM Coly Li <[email protected]
> >>> <mailto:[email protected]>> wrote:
> >>>
> >>> On 1/5/21 11:44 AM, Dongdong Tao wrote:
> >>> > Hey Coly,
> >>> >
> >>> > This is the second version of the patch, please allow me to explain a
> >>> > bit for this patch:
> >>> >
> >>> > We accelerate the rate in 3 stages with different aggressiveness, the
> >>> > first stage starts when dirty buckets percent reach above
> >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
> >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
> >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
> >>> > tries to writeback the amount of dirty data in one bucket (on average)
> >>> > in (1 / (dirty_buckets_percent - 50)) second, the second stage
> >>> tries to
> >>> > writeback the amount of dirty data in one bucket in (1 /
> >>> > (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
> >>> > to writeback the amount of dirty data in one bucket in (1 /
> >>> > (dirty_buckets_percent - 64)) * 20 millisecond.
> >>> >
> >>> > As we can see, there are two writeback aggressiveness increasing
> >>> > strategies, one strategy is with the increasing of the stage, the
> >>> first
> >>> > stage is the easy-going phase whose initial rate is trying to
> >>> write back
> >>> > dirty data of one bucket in 1 second, the second stage is a bit more
> >>> > aggressive, the initial rate tries to writeback the dirty data of one
> >>> > bucket in 200 ms, the last stage is even more, whose initial rate
> >>> tries
> >>> > to writeback the dirty data of one bucket in 20 ms. This makes sense,
> >>> > one reason is that if the preceding stage couldn’t get the
> >>> fragmentation
> >>> > to a fine stage, then the next stage should increase the
> >>> aggressiveness
> >>> > properly, also it is because the later stage is closer to the
> >>> > bch_cutoff_writeback_sync. Another aggressiveness increasing
> >>> strategy is
> >>> > with the increasing of dirty bucket percent within each stage, the
> >>> first
> >>> > strategy controls the initial writeback rate of each stage, while this
> >>> > one increases the rate based on the initial rate, which is
> >>> initial_rate
> >>> > * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
> >>> >
> >>> > The initial rate can be controlled by 3 parameters
> >>> > writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
> >>> > writeback_rate_fp_term_high, they are default 1, 5, 50, users can
> >>> adjust
> >>> > them based on their needs.
> >>> >
> >>> > The reason that I choose 50, 57, 64 as the threshold value is because
> >>> > the GC must be triggered at least once during each stage due to the
> >>> > “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache
> >>> size. So,
> >>> > the hope is that the first and second stage can get us back to good
> >>> > shape in most situations by smoothly writing back the dirty data
> >>> without
> >>> > giving too much stress to the backing devices, but it might still
> >>> enter
> >>> > the third stage if the bucket consumption is very aggressive.
> >>> >
> >>> > This patch use (dirty / dirty_buckets) * fp_term to calculate the
> >>> rate,
> >>> > this formula means that we want to writeback (dirty /
> >>> dirty_buckets) in
> >>> > 1/fp_term second, fp_term is calculated by above aggressiveness
> >>> > controller, “dirty” is the current dirty sectors, “dirty_buckets”
> >>> is the
> >>> > current dirty buckets, so (dirty / dirty_buckets) means the average
> >>> > dirty sectors in one bucket, the value is between 0 to 1024 for the
> >>> > default setting, so this formula basically gives a hint that to
> >>> reclaim
> >>> > one bucket in 1/fp_term second. By using this semantic, we can have a
> >>> > lower writeback rate when the amount of dirty data is decreasing and
> >>> > overcome the fact that dirty buckets number is always increasing
> >>> unless
> >>> > GC happens.
> >>> >
> >>> > *Compare to the first patch:
> >>> > *The first patch is trying to write back all the data in 40 seconds,
> >>> > this will result in a very high writeback rate when the amount of
> >>> dirty
> >>> > data is big, this is mostly true for the large cache devices. The
> >>> basic
> >>> > problem is that the semantic of this patch is not ideal, because we
> >>> > don’t really need to writeback all dirty data in order to solve this
> >>> > issue, and the instant large increase of the rate is something I
> >>> feel we
> >>> > should better avoid (I like things to be smoothly changed unless no
> >>> > choice: )).
> >>> >
> >>> > Before I get to this new patch(which I believe should be optimal
> >>> for me
> >>> > atm), there have been many tuning/testing iterations, eg. I’ve
> >>> tried to
> >>> > tune the algorithm to writeback ⅓ of the dirty data in a certain
> >>> amount
> >>> > of seconds, writeback 1/fragment of the dirty data in a certain amount
> >>> > of seconds, writeback all the dirty data only in those error_buckets
> >>> > (error buckets = dirty buckets - 50% of the total buckets) in a
> >>> certain
> >>> > amount of time. However, those all turn out not to be ideal, only the
> >>> > semantic of the patch makes much sense for me and allows me to control
> >>> > the rate in a more precise way.
> >>> >
> >>> > *Testing data:
> >>> > *I'll provide the visualized testing data in the next couple of days
> >>> > with 1TB NVME devices cache but with HDD as backing device since it's
> >>> > what we mostly used in production env.
> >>> > I have the data for 400GB NVME, let me prepare it and take it for
> >>> you to
> >>> > review.
> >>> [snipped]
> >>>
> >>> Hi Dongdong,
> >>>
> >>> Thanks for the update and continuous effort on this idea.
> >>>
> >>> Please keep in mind the writeback rate is just a advice rate for the
> >>> writeback throughput, in real workload changing the writeback rate
> >>> number does not change writeback throughput obviously.
> >>>
> >>> Currently I feel this is an interesting and promising idea for your
> >>> patch, but I am not able to say whether it may take effect in real
> >>> workload, so we do need convinced performance data on real workload and
> >>> configuration.
> >>>
> >>> Of course I may also help on the benchmark, but my to-do list is long
> >>> enough and it may take a very long delay time.
> >>>
> >>> Thanks.
> >>>
> >>> Coly Li
> >>>
> >>
>
[Sharing the google doc here to avoid SPAM detection]
Here is the new testing result from the multi-threaded fio testing:
https://docs.google.com/document/d/1AmbIEa_2MhB9bqhC3rfga9tp7n9YX9PLn0jSUxscVW0/edit?usp=sharing
On Fri, Jan 8, 2021 at 4:47 PM Dongdong Tao <[email protected]> wrote:
>
> Yeap, I will scale the testing for multiple threads with larger IO
> depth, thanks for the suggestion!
>
> On Fri, Jan 8, 2021 at 4:40 PM Coly Li <[email protected]> wrote:
> >
> > On 1/8/21 4:30 PM, Dongdong Tao wrote:
> > > Hi Coly,
> > >
> > > They are captured with the same time length, the meaning of the
> > > timestamp and the time unit on the x-axis are different.
> > > (Sorry, I should have clarified this right after the chart)
> > >
> > > For the latency chart:
> > > The timestamp is the relative time since the beginning of the
> > > benchmark, so the start timestamp is 0 and the unit is milliseconds.
> > >
> > > For the dirty data and cache available percent chart:
> > > The timestamp is the UNIX timestamp, the time unit is seconds, and
> > > I capture the stats every 5 seconds with the below script:
> > > ---
> > > #!/bin/sh
> > > while true; do echo "`date +%s`, `cat
> > > /sys/block/bcache0/bcache/dirty_data`, `cat
> > > /sys/block/bcache0/bcache/cache/cache_available_percent`, `cat
> > > /sys/block/bcache0/bcache/writeback_rate`" >> $1; sleep 5; done;
> > > ---
> > >
> > > Unfortunately, I can't easily make them use the same timestamp, but
> > > I guess I can try to convert the UNIX timestamp to relative time
> > > like the first one.
> > > But if we ignore the value of the X-axis, we can still roughly
> > > compare them by using the length of the X-axis, since they have the
> > > same time length,
> > > and we can see that the Master's writes start hitting the backing
> > > device when the cache_available_percent dropped to around 30.
> >
> > Copied, thanks for the explanation. The chart for a single thread with
> > io depth 1 is convincing IMHO :-)
> >
> > One more question: the benchmark is about a single I/O thread with io
> > depth 1, which is not a typical condition for a real workload. Do you
> > have a plan to test the latency and IOPS for multiple threads with a
> > larger I/O depth?
> >
> >
> > Thanks.
> >
> >
> > Coly Li
> >
> >
> > >
> > > On Fri, Jan 8, 2021 at 12:06 PM Coly Li <[email protected]> wrote:
> > >>
> > >> On 1/7/21 10:55 PM, Dongdong Tao wrote:
> > >>> Hi Coly,
> > >>>
> > >>>
> > >>> Thanks for the reminder. I understand that the rate is only a hint of
> > >>> the throughput; it’s a value used to calculate the sleep time between
> > >>> each round of keys writeback. The higher the rate, the shorter the
> > >>> sleep time, and most of the time this means more dirty keys can be
> > >>> written back in a certain amount of time before the hard disk runs
> > >>> out of speed.
> > >>>
> > >>>
> > >>> Here is the testing data that run on a 400GB NVME + 1TB NVME HDD
> > >>>
> > >>
> > >> Hi Dongdong,
> > >>
> > >> Nice charts :-)
> > >>
> > >>> Steps:
> > >>>
> > >>> 1.
> > >>>
> > >>> make-bcache -B <HDD> -C <NVME> --writeback
> > >>>
> > >>> 2.
> > >>>
> > >>> sudo fio --name=random-writers --filename=/dev/bcache0
> > >>> --ioengine=libaio --iodepth=1 --rw=randrw --blocksize=64k,8k
> > >>> --direct=1 --numjobs=1 --write_lat_log=mix --log_avg_msec=10
> > >>>> The fio benchmark commands ran for about 20 hours.
> > >>>
> > >>
> > >> The time lengths of the first 3 charts are 7.000e+7, the rest are
> > >> 1.60930e+9. I guess the time length of the I/O latency chart is
> > >> 1/100 of the rest.
> > >>
> > >> Can you also post the latency charts for 1.60930e+9 seconds? Then I can
> > >> compare the latency with dirty data and available cache charts.
> > >>
> > >>
> > >> Thanks.
> > >>
> > >>
> > >> Coly Li
> > >>
> > >>
> > >>
> > >>
> > >>
> > >>>
> > >>> Let’s have a look at the write latency first:
> > >>>
> > >>> Master:
> > >>>
> > >>>
> > >>>
> > >>> Master+the patch:
> > >>>
> > >>> Combine them together:
> > >>>
> > >>> Again, the latency (y-axis) is in nanoseconds and the x-axis is the
> > >>> timestamp in milliseconds. As we can see, the master latency is
> > >>> obviously much higher than the one with my patch once the master bcache
> > >>> hits the cutoff writeback sync, and the master isn’t going to get out of
> > >>> this cutoff writeback sync situation. This graph shows it had already
> > >>> been stuck at the cutoff writeback sync for about 4 hours before I
> > >>> finished the testing, and it may still need to stay stuck for days
> > >>> before it can get out of this situation by itself.
> > >>>
> > >>>
> > >>> Note that there are 1 million points for each run: red represents
> > >>> master, green represents master+my patch. Most of them overlap with
> > >>> each other, so it may look like this graph has more red points than
> > >>> green after hitting the cutoff, but that is simply because the latency
> > >>> has scaled to a bigger range which represents the HDD latency.
> > >>>
> > >>>
> > >>>
> > >>> Let’s also have a look at the bcache’s cache available percent and dirty
> > >>> data percent.
> > >>>
> > >>> Master:
> > >>>
> > >>> Master+this patch:
> > >>>
> > >>> As you can see, this patch can avoid it hitting the cutoff writeback sync.
> > >>>
> > >>>
> > >>> As for the improvement of this patch over the first one, let’s
> > >>> take a look at how the writeback rate changes during the run.
> > >>>
> > >>> patch V1:
> > >>>
> > >>>
> > >>>
> > >>> Patch V2:
> > >>>
> > >>>
> > >>> The Y-axis is the value of the rate. V1 is very aggressive, as it
> > >>> jumps instantly from the minimum 8 to around 10 million, while patch V2
> > >>> keeps the rate under 5000 during the run, and after the first round of
> > >>> writeback it can stay even under 2500. This proves we don’t need to
> > >>> be as aggressive as V1 to get out of the high-fragmentation situation
> > >>> which eventually causes all writes to hit the backing device. This
> > >>> looks very reasonable to me now.
> > >>>
> > >>> Note that the fio command that I used consumes buckets quite
> > >>> aggressively, so it had to hit the third stage, which has the highest
> > >>> aggressiveness. But I believe this is not true in a real production
> > >>> env; a real production env won’t consume buckets that aggressively, so
> > >>> I expect stage 3 will rarely need to be hit.
> > >>>
> > >>>
> > >>> As discussed, I'll run multiple block size testing on at least 1TB NVME
> > >>> device later.
> > >>> But it might take some time.
> > >>>
> > >>>
> > >>> Regards,
> > >>> Dongdong
> > >>>
> > >>> On Tue, Jan 5, 2021 at 12:33 PM Coly Li <[email protected]> wrote:
> > >>>
> > >>> On 1/5/21 11:44 AM, Dongdong Tao wrote:
> > >>> > Hey Coly,
> > >>> >
> > >>> > This is the second version of the patch, please allow me to explain a
> > >>> > bit for this patch:
> > >>> >
> > >>> > We accelerate the rate in 3 stages with different aggressiveness, the
> > >>> > first stage starts when dirty buckets percent reach above
> > >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW(50), the second is
> > >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID(57) and the third is
> > >>> > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH(64). By default the first stage
> > >>> > tries to writeback the amount of dirty data in one bucket (on average)
> > >>> > in (1 / (dirty_buckets_percent - 50)) second, the second stage
> > >>> tries to
> > >>> > writeback the amount of dirty data in one bucket in (1 /
> > >>> > (dirty_buckets_percent - 57)) * 200 millisecond. The third stage tries
> > >>> > to writeback the amount of dirty data in one bucket in (1 /
> > >>> > (dirty_buckets_percent - 64)) * 20 millisecond.
> > >>> >
> > >>> > As we can see, there are two writeback aggressiveness increasing
> > >>> > strategies, one strategy is with the increasing of the stage, the
> > >>> first
> > >>> > stage is the easy-going phase whose initial rate is trying to
> > >>> write back
> > >>> > dirty data of one bucket in 1 second, the second stage is a bit more
> > >>> > aggressive, the initial rate tries to writeback the dirty data of one
> > >>> > bucket in 200 ms, the last stage is even more, whose initial rate
> > >>> tries
> > >>> > to writeback the dirty data of one bucket in 20 ms. This makes sense,
> > >>> > one reason is that if the preceding stage couldn’t get the
> > >>> fragmentation
> > >>> > to a fine stage, then the next stage should increase the
> > >>> aggressiveness
> > >>> > properly, also it is because the later stage is closer to the
> > >>> > bch_cutoff_writeback_sync. Another aggressiveness increasing
> > >>> strategy is
> > >>> > with the increasing of dirty bucket percent within each stage, the
> > >>> first
> > >>> > strategy controls the initial writeback rate of each stage, while this
> > >>> > one increases the rate based on the initial rate, which is
> > >>> initial_rate
> > >>> > * (dirty bucket percent - BCH_WRITEBACK_FRAGMENT_THRESHOLD_X).
> > >>> >
> > >>> > The initial rate can be controlled by 3 parameters
> > >>> > writeback_rate_fp_term_low, writeback_rate_fp_term_mid,
> > >>> > writeback_rate_fp_term_high, they are default 1, 5, 50, users can
> > >>> adjust
> > >>> > them based on their needs.
> > >>> >
> > >>> > The reason that I choose 50, 57, 64 as the threshold value is because
> > >>> > the GC must be triggered at least once during each stage due to the
> > >>> > “sectors_to_gc” being set to 1/16 (6.25 %) of the total cache
> > >>> size. So,
> > >>> > the hope is that the first and second stage can get us back to good
> > >>> > shape in most situations by smoothly writing back the dirty data
> > >>> without
> > >>> > giving too much stress to the backing devices, but it might still
> > >>> enter
> > >>> > the third stage if the bucket consumption is very aggressive.
> > >>> >
> > >>> > This patch use (dirty / dirty_buckets) * fp_term to calculate the
> > >>> rate,
> > >>> > this formula means that we want to writeback (dirty /
> > >>> dirty_buckets) in
> > >>> > 1/fp_term second, fp_term is calculated by above aggressiveness
> > >>> > controller, “dirty” is the current dirty sectors, “dirty_buckets”
> > >>> is the
> > >>> > current dirty buckets, so (dirty / dirty_buckets) means the average
> > >>> > dirty sectors in one bucket, the value is between 0 to 1024 for the
> > >>> > default setting, so this formula basically gives a hint that to
> > >>> reclaim
> > >>> > one bucket in 1/fp_term second. By using this semantic, we can have a
> > >>> > lower writeback rate when the amount of dirty data is decreasing and
> > >>> > overcome the fact that dirty buckets number is always increasing
> > >>> unless
> > >>> > GC happens.
> > >>> >
> > >>> > *Compare to the first patch:
> > >>> > *The first patch is trying to write back all the data in 40 seconds,
> > >>> > this will result in a very high writeback rate when the amount of
> > >>> dirty
> > >>> > data is big, this is mostly true for the large cache devices. The
> > >>> basic
> > >>> > problem is that the semantic of this patch is not ideal, because we
> > >>> > don’t really need to writeback all dirty data in order to solve this
> > >>> > issue, and the instant large increase of the rate is something I
> > >>> feel we
> > >>> > should better avoid (I like things to be smoothly changed unless no
> > >>> > choice: )).
> > >>> >
> > >>> > Before I get to this new patch(which I believe should be optimal
> > >>> for me
> > >>> > atm), there have been many tuning/testing iterations, eg. I’ve
> > >>> tried to
> > >>> > tune the algorithm to writeback ⅓ of the dirty data in a certain
> > >>> amount
> > >>> > of seconds, writeback 1/fragment of the dirty data in a certain amount
> > >>> > of seconds, writeback all the dirty data only in those error_buckets
> > >>> > (error buckets = dirty buckets - 50% of the total buckets) in a
> > >>> certain
> > >>> > amount of time. However, those all turn out not to be ideal, only the
> > >>> > semantic of the patch makes much sense for me and allows me to control
> > >>> > the rate in a more precise way.
> > >>> >
> > >>> > *Testing data:
> > >>> > *I'll provide the visualized testing data in the next couple of days
> > >>> > with 1TB NVME devices cache but with HDD as backing device since it's
> > >>> > what we mostly used in production env.
> > >>> > I have the data for 400GB NVME, let me prepare it and take it for
> > >>> you to
> > >>> > review.
> > >>> [snipped]
> > >>>
> > >>> Hi Dongdong,
> > >>>
> > >>> Thanks for the update and continuous effort on this idea.
> > >>>
> > >>> Please keep in mind the writeback rate is just a advice rate for the
> > >>> writeback throughput, in real workload changing the writeback rate
> > >>> number does not change writeback throughput obviously.
> > >>>
> > >>> Currently I feel this is an interesting and promising idea for your
> > >>> patch, but I am not able to say whether it may take effect in real
> > >>> workload, so we do need convinced performance data on real workload and
> > >>> configuration.
> > >>>
> > >>> Of course I may also help on the benchmark, but my to-do list is long
> > >>> enough and it may take a very long delay time.
> > >>>
> > >>> Thanks.
> > >>>
> > >>> Coly Li
> > >>>
> > >>
> >
On 1/14/21 12:45 PM, Dongdong Tao wrote:
> Hi Coly,
>
> I've got the testing data for multiple threads with larger IO depth.
>
Hi Dongdong,
Thanks for the testing number.
> *Here is the testing steps:
> *1. make-bcache -B <> -C <> --writeback
>
> 2. Open two tabs, start different fio task in them at the same time.
> Tab1 run below fio command:
> sudo fio --name=random-writers --filename=/dev/bcache0 --ioengine=libaio
> --iodepth=32 --rw=randrw --blocksize=64k,8k --direct=1 --runtime=24000
>
> Tab2 run below fio command:
> sudo fio --name=random-writers2 --filename=/dev/bcache0
> --ioengine=libaio --iodepth=8 --rw=randwrite --bs=4k --rate_iops=150
> --direct=1 --write_lat_log=rw --log_avg_msec=20
>
Why do you limit the iodepth to 8 and the iops to 150 on the cache device?
For a cache device this limitation is small. 150 IOPS with a 4KB block
size means writing (150*4*60*60 = 2160000KB, about) 2GB of data every
hour. For 35 hours it is only 70GB.
What if the iodepth is 128 or 64, and there is no iops rate limitation?
> Note
> - Tab1 fio will run for 24000 seconds, which is the one to cause the
> fragmentation and made the cache_available_percent drops to under 40.
> - Tab2 fio is the one that I'm capturing the latency and I have let it
> run for about 35 hours, which is long enough to allow the
> cache_available_percent drops under 30.
> - This testing method utilized fio benchmark with larger read block
> size/small write block size to cause the high fragmentation, However in
> a real production env, there could be
> various reasons or a combination of various reasons to cause the high
> fragmentation, but I believe it should be ok to use any method to cause
> the fragmentation to verify if
> bcache with this patch is responding better than the master in this
> situation.
>
> *Below is the testing result:*
>
> The total run time is about 35 hours, the latency points in the charts
> for each run are 1.5 million
>
> Master:
> fio-lat-mater.png
>
> Master + patch:
> fio-lat-patch.png
> Combine them together:
> fio-lat-mix.png
>
> Now we can see the master is even worse when we increase the iodepth,
> which makes sense since the backing HDD is being stressed more hardly.
>
> *Below are the cache stats changing during the run:*
> Master:
> bcache-stats-master.png
>
> Master + the patch:
> bcache-stats-patch.png
>
> That's all the testing done with 400GB NVME with 512B block size.
>
> Coly, do you want me to continue the same testing on 1TB nvme with
> different block size ?
> or is it ok to skip the 1TB testing and continue the test with 400GB
> NVME but with different block size?
> feel free to let me know any other test scenarios that we should cover
> here.
Yes please, more testing is desired for a performance improvement. So far
I don't see performance numbers for a real high workload yet.
Thanks.
Coly Li
Hi Coly,
Why do you limit the iodepth to 8 and the iops to 150 on the cache device?
For a cache device this limitation is small. 150 IOPS with a 4KB block
size means writing (150*4*60*60 = 2160000KB, about) 2GB of data every
hour. For 35 hours it is only 70GB.
What if the iodepth is 128 or 64, and there is no iops rate limitation?
-> There are two reasons why I limit the iodepth and iops rate.
1. If I don't limit them, the dirty cache will be filled up very
quickly, within 20 minutes.
   It's running at almost NVME speed before it reaches the 70 percent
cutoff_writeback_sync, and there is no way for any kind of writeback
to stop it from filling up, due to the huge gap between NVME and HDD
in terms of throughput.
   I don't think there is anything we can do about it, and it should
only happen in a benchmark world, not in production.
   The improvement I'm trying to make here is for a normal production
workload, not for this benchmark scenario.
   I currently can't see any necessity to test this scenario, please
kindly let me know if I'm wrong about this.
2. The reason that I set the iodepth to 8 and the iops to 150 is based
on what I have observed in production environments, mostly ceph:
   ceph-osd has fewer than 10 threads (default setting) that send io
to bcache in parallel. But I'm not sure about other applications.
   I agree that we can increase the iodepth to 64 or 128, that's
doable. But we have to limit the iops; 150 IOPS is a reasonable
workload.
   The busiest ceph-osd that I've seen is about 1000 IOPS, but the
average is still only about 600.
   I can set the IOPS to a higher value like 600 and the iodepth to
128 for the later test, if that makes sense to you?
Lastly, please allow me to clarify more about the production issue
that this patch is trying to address:
In the production envs that hit this issue, it usually takes a very
long time (often days) for the cache_available_percent to drop to
30, and the dirty data mostly stays at a very low level (around
10 percent), which means that the bcache isn't being stressed very
hard most of the time.
There is no intention to save the cutoff_writeback_sync when the
bcache is being stressed without limitation, hope the above makes sense :)
By the way, my colleague and I are trying to gather some production
bcache stats, and I hope we can give you the performance numbers before
and after applying the patch.
Thanks,
Dongdong
On Thu, Jan 14, 2021 at 6:05 PM Coly Li <[email protected]> wrote:
>
> On 1/14/21 12:45 PM, Dongdong Tao wrote:
> > Hi Coly,
> >
> > I've got the testing data for multiple threads with larger IO depth.
> >
>
> Hi Dongdong,
>
> Thanks for the testing number.
>
> > *Here is the testing steps:
> > *1. make-bcache -B <> -C <> --writeback
> >
> > 2. Open two tabs, start different fio task in them at the same time.
> > Tab1 run below fio command:
> > sudo fio --name=random-writers --filename=/dev/bcache0 --ioengine=libaio
> > --iodepth=32 --rw=randrw --blocksize=64k,8k --direct=1 --runtime=24000
> >
> > Tab2 run below fio command:
> > sudo fio --name=random-writers2 --filename=/dev/bcache0
> > --ioengine=libaio --iodepth=8 --rw=randwrite --bs=4k --rate_iops=150
> > --direct=1 --write_lat_log=rw --log_avg_msec=20
> >
>
>
> Why you limit the iodep to 8 and iops to 150 on cache device?
> For cache device the limitation is small. Iosp 150 with 4KB block size,
> it means every hour writing (150*4*60*60=2160000KB=) 2GB data. For 35
> hours it is only 70GB.
>
>
> What if the iodeps is 128 or 64, and no iops rate limitation ?
>
>
> > Note
> > - Tab1 fio will run for 24000 seconds, which is the one to cause the
> > fragmentation and made the cache_available_percent drops to under 40.
> > - Tab2 fio is the one that I'm capturing the latency and I have let it
> > run for about 35 hours, which is long enough to allow the
> > cache_available_percent drops under 30.
> > - This testing method utilized fio benchmark with larger read block
> > size/small write block size to cause the high fragmentation, However in
> > a real production env, there could be
> > various reasons or a combination of various reasons to cause the high
> > fragmentation, but I believe it should be ok to use any method to cause
> > the fragmentation to verify if
> > bcache with this patch is responding better than the master in this
> > situation.
> >
> > *Below is the testing result:*
> >
> > The total run time is about 35 hours, the latency points in the charts
> > for each run are 1.5 million
> >
> > Master:
> > fio-lat-mater.png
> >
> > Master + patch:
> > fio-lat-patch.png
> > Combine them together:
> > fio-lat-mix.png
> >
> > Now we can see the master is even worse when we increase the iodepth,
> > which makes sense since the backing HDD is being stressed more hardly.
> >
> > *Below are the cache stats changing during the run:*
> > Master:
> > bcache-stats-master.png
> >
> > Master + the patch:
> > bcache-stats-patch.png
> >
> > That's all the testing done with 400GB NVME with 512B block size.
> >
> > Coly, do you want me to continue the same testing on 1TB nvme with
> > different block size ?
> > or is it ok to skip the 1TB testing and continue the test with 400GB
> > NVME but with different block size?
> > feel free to let me know any other test scenarios that we should cover
> > here.
>
> Yes please, more testing is desired for performance improvement. So far
> I don't see performance number for real high work load yet.
>
> Thanks.
>
> Coly Li
>
On 1/14/21 8:22 PM, Dongdong Tao wrote:
> Hi Coly,
>
> Why you limit the iodeph to 8 and iops to 150 on cache device?
> For cache device the limitation is small. Iosp 150 with 4KB block size,
> it means every hour writing (150*4*60*60=2160000KB=) 2GB data. For 35
> hours it is only 70GB.
>
>
> What if the iodepth is 128 or 64, and no iops rate limitation ?
> -> There are two reasons why I limit the iodepth and iops rate.
> 1. If I don't limit them, the dirty cache will be filled up very
> quickly within 20 minutes.
> It's almost NVME speed before it reaches the 70
> cutoff_writeback_sync, there is no way for any kind of writeback to
> stop it from
> filling up due to the huge gap between NVME and HDD in terms of
> the throughput,
> I don't think there is anything we can do about it? and it should
> only happen in a benchmark world, not should in production.
> The improvement I'm trying to do here is just for normal
> production workload ,not for this benchmark scenario really.
> I currently can't see any necessity to test this scenario, please
> kindly let me know about this if I'm wrong.
>
> 2. The reason that I set iodepth to 8 and iops to 150 is based on the
> experience that I observed from production env, mostly ceph,
> ceph-osd has less than 10 thread(default setting) that will send
> io to bcache in parallel. But I'm not sure about other applications.
> I agree that we can increase the iodepth to 64 or 128 and it's
> doable. But we have to limit the iops, 150 IOPS is a reasonable
> workload.
> The most busy ceph-osd that I've seen is about 1000 IOPS, but on
> average is still only about 600.
> I can set the IOPS to a higher value like 600 and the iodepth to
> 128 to perform the later test if it make sense to you?
>
OK, with the extra information I now understand the reason. Since the
cache device is filled up within 20 minutes, it is unnecessary to do the
faster testing on your side. Let me do it later on my hardware.
> Lastly, please allow me to clarify more about the production issue
> that this patch is trying to address:
>
> In the production env that hit this issue, it usually takes a very
> long time (many take days) for the cache_available_percent to drop to
> 30, and the dirty data is mostly staying at a very low level (around
> 10 percent), which means that the bcache isn't being stressed very
> hard most of the time.
> There is no intention to save the cutoff_writeback_sync when the
> bcache is being stressed without limitation, hope above make sense :)
>
Yes, you explained this clearly previously. What I worried about was
whether a faster writeback may interfere with the throughput and latency
of regular I/Os.
From your current testing data it looks fine to me.
> By the way, my colleague and I are trying to gathering some production
> bcache stats, I hope we can give you the performance number before and
> after applying the patch.
Yes that will be great.
And could you please gather all the current data charts into a single
email, and reference it in your patch via lore? Then people who don't
subscribe to the linux-bcache mailing list may find all the posted
performance data from your patch.
In general your testing data is convincing IMHO, and I will add your
updated patch for the 5.12 merge window.
Thanks.
Coly Li
>
>
> On Thu, Jan 14, 2021 at 6:05 PM Coly Li <[email protected]> wrote:
>>
>> On 1/14/21 12:45 PM, Dongdong Tao wrote:
>>> Hi Coly,
>>>
>>> I've got the testing data for multiple threads with larger IO depth.
>>>
>>
>> Hi Dongdong,
>>
>> Thanks for the testing number.
>>
>>> *Here is the testing steps:
>>> *1. make-bcache -B <> -C <> --writeback
>>>
>>> 2. Open two tabs, start different fio task in them at the same time.
>>> Tab1 run below fio command:
>>> sudo fio --name=random-writers --filename=/dev/bcache0 --ioengine=libaio
>>> --iodepth=32 --rw=randrw --blocksize=64k,8k --direct=1 --runtime=24000
>>>
>>> Tab2 run below fio command:
>>> sudo fio --name=random-writers2 --filename=/dev/bcache0
>>> --ioengine=libaio --iodepth=8 --rw=randwrite --bs=4k --rate_iops=150
>>> --direct=1 --write_lat_log=rw --log_avg_msec=20
>>>
>>
>>
>> Why you limit the iodep to 8 and iops to 150 on cache device?
>> For cache device the limitation is small. Iosp 150 with 4KB block size,
>> it means every hour writing (150*4*60*60=2160000KB=) 2GB data. For 35
>> hours it is only 70GB.
>>
>>
>> What if the iodeps is 128 or 64, and no iops rate limitation ?
>>
>>
>>> Note
>>> - Tab1 fio will run for 24000 seconds, which is the one to cause the
>>> fragmentation and made the cache_available_percent drops to under 40.
>>> - Tab2 fio is the one that I'm capturing the latency and I have let it
>>> run for about 35 hours, which is long enough to allow the
>>> cache_available_percent drops under 30.
>>> - This testing method utilized fio benchmark with larger read block
>>> size/small write block size to cause the high fragmentation, However in
>>> a real production env, there could be
>>> various reasons or a combination of various reasons to cause the high
>>> fragmentation, but I believe it should be ok to use any method to cause
>>> the fragmentation to verify if
>>> bcache with this patch is responding better than the master in this
>>> situation.
>>>
>>> *Below is the testing result:*
>>>
>>> The total run time is about 35 hours, the latency points in the charts
>>> for each run are 1.5 million
>>>
>>> Master:
>>> fio-lat-mater.png
>>>
>>> Master + patch:
>>> fio-lat-patch.png
>>> Combine them together:
>>> fio-lat-mix.png
>>>
>>> Now we can see the master is even worse when we increase the iodepth,
>>> which makes sense since the backing HDD is being stressed more hardly.
>>>
>>> *Below are the cache stats changing during the run:*
>>> Master:
>>> bcache-stats-master.png
>>>
>>> Master + the patch:
>>> bcache-stats-patch.png
>>>
>>> That's all the testing done with 400GB NVME with 512B block size.
>>>
>>> Coly, do you want me to continue the same testing on 1TB nvme with
>>> different block size ?
>>> or is it ok to skip the 1TB testing and continue the test with 400GB
>>> NVME but with different block size?
>>> feel free to let me know any other test scenarios that we should cover
>>> here.
>>
>> Yes please, more testing is desired for performance improvement. So far
>> I don't see performance number for real high work load yet.
>>
>> Thanks.
>>
>> Coly Li
>>
Hi Coly,
Apologies for any confusion that I might have caused, and thanks a lot
for your patience and your help !
On Thu, Jan 14, 2021 at 9:31 PM Coly Li <[email protected]> wrote:
>
> On 1/14/21 8:22 PM, Dongdong Tao wrote:
> > Hi Coly,
> >
> > Why you limit the iodeph to 8 and iops to 150 on cache device?
> > For cache device the limitation is small. Iosp 150 with 4KB block size,
> > it means every hour writing (150*4*60*60=2160000KB=) 2GB data. For 35
> > hours it is only 70GB.
> >
> >
> > What if the iodepth is 128 or 64, and no iops rate limitation ?
> > -> There are two reasons why I limit the iodepth and iops rate.
> > 1. If I don't limit them, the dirty cache will be filled up very
> > quickly within 20 minutes.
> > It's almost NVME speed before it reaches the 70
> > cutoff_writeback_sync, there is no way for any kind of writeback to
> > stop it from
> > filling up due to the huge gap between NVME and HDD in terms of
> > the throughput,
> > I don't think there is anything we can do about it? and it should
> > only happen in a benchmark world, not should in production.
> > The improvement I'm trying to do here is just for normal
> > production workload ,not for this benchmark scenario really.
> > I currently can't see any necessity to test this scenario, please
> > kindly let me know about this if I'm wrong.
> >
> > 2. The reason that I set iodepth to 8 and iops to 150 is based on the
> > experience that I observed from production env, mostly ceph,
> > ceph-osd has less than 10 thread(default setting) that will send
> > io to bcache in parallel. But I'm not sure about other applications.
> > I agree that we can increase the iodepth to 64 or 128 and it's
> > doable. But we have to limit the iops, 150 IOPS is a reasonable
> > workload.
> > The most busy ceph-osd that I've seen is about 1000 IOPS, but on
> > average is still only about 600.
> > I can set the IOPS to a higher value like 600 and the iodepth to
> > 128 to perform the later test if it make sense to you?
> >
>
> OK, now I know the reason with the extra information. Since the cache
> device is filled up within 20 minutes, it is unnecessary to do the
> faster testing on your side. Let me do it later on my hardware.
>
>
> > Lastly, please allow me to clarify more about the production issue
> > that this patch is trying to address:
> >
> > In the production env that hit this issue, it usually takes a very
> > long time (many take days) for the cache_available_percent to drop to
> > 30, and the dirty data is mostly staying at a very low level (around
> > 10 percent), which means that the bcache isn't being stressed very
> > hard most of the time.
> > There is no intention to save the cutoff_writeback_sync when the
> > bcache is being stressed without limitation, hope above make sense :)
> >
>
> Yes you explained clearly previously. What I worried was whether a
> faster writeback may interfere throughput and latency of regular I/O
> regular I/Os.
>
> From your current testing data it looks find with me.
>
>
> > By the way, my colleague and I are trying to gathering some production
> > bcache stats, I hope we can give you the performance number before and
> > after applying the patch.
>
> Yes that will be great.
>
> And could you please gather all current data chats into a single email,
> and reference it in your patch via lore ? Then for people don't
> subscribe linux-bcache mailing list, they may find all the posted
> performance data from you patch.
>
Sounds good, I'll update the patch comment with the reference data.
But it seems like the linux mailing list doesn't accept charts?
(they always get detected as SPAM)
I can't be sure though; I'll try to send them again, and if that fails
I'll put all the data into a google doc.
> In general your testing data is convinced IMHO, and I will add your
> updated patch for 5.12 merge window.
>
Thank you Coly, that's great !!!
>
> Thanks.
>
> Coly Li
>
>
> >
> >
> > On Thu, Jan 14, 2021 at 6:05 PM Coly Li <[email protected]> wrote:
> >>
> >> On 1/14/21 12:45 PM, Dongdong Tao wrote:
> >>> Hi Coly,
> >>>
> >>> I've got the testing data for multiple threads with larger IO depth.
> >>>
> >>
> >> Hi Dongdong,
> >>
> >> Thanks for the testing number.
> >>
> >>> *Here is the testing steps:
> >>> *1. make-bcache -B <> -C <> --writeback
> >>>
> >>> 2. Open two tabs, start different fio task in them at the same time.
> >>> Tab1 run below fio command:
> >>> sudo fio --name=random-writers --filename=/dev/bcache0 --ioengine=libaio
> >>> --iodepth=32 --rw=randrw --blocksize=64k,8k --direct=1 --runtime=24000
> >>>
> >>> Tab2 run below fio command:
> >>> sudo fio --name=random-writers2 --filename=/dev/bcache0
> >>> --ioengine=libaio --iodepth=8 --rw=randwrite --bs=4k --rate_iops=150
> >>> --direct=1 --write_lat_log=rw --log_avg_msec=20
> >>>
> >>
> >>
> >> Why you limit the iodep to 8 and iops to 150 on cache device?
> >> For cache device the limitation is small. Iosp 150 with 4KB block size,
> >> it means every hour writing (150*4*60*60=2160000KB=) 2GB data. For 35
> >> hours it is only 70GB.
> >>
> >>
> >> What if the iodeps is 128 or 64, and no iops rate limitation ?
> >>
> >>
> >>> Note
> >>> - Tab1 fio will run for 24000 seconds, which is the one to cause the
> >>> fragmentation and made the cache_available_percent drops to under 40.
> >>> - Tab2 fio is the one that I'm capturing the latency and I have let it
> >>> run for about 35 hours, which is long enough to allow the
> >>> cache_available_percent drops under 30.
> >>> - This testing method utilized fio benchmark with larger read block
> >>> size/small write block size to cause the high fragmentation, However in
> >>> a real production env, there could be
> >>> various reasons or a combination of various reasons to cause the high
> >>> fragmentation, but I believe it should be ok to use any method to cause
> >>> the fragmentation to verify if
> >>> bcache with this patch is responding better than the master in this
> >>> situation.
> >>>
> >>> *Below is the testing result:*
> >>>
> >>> The total run time is about 35 hours, the latency points in the charts
> >>> for each run are 1.5 million
> >>>
> >>> Master:
> >>> fio-lat-mater.png
> >>>
> >>> Master + patch:
> >>> fio-lat-patch.png
> >>> Combine them together:
> >>> fio-lat-mix.png
> >>>
> >>> Now we can see the master is even worse when we increase the iodepth,
> >>> which makes sense since the backing HDD is being stressed more hardly.
> >>>
> >>> *Below are the cache stats changing during the run:*
> >>> Master:
> >>> bcache-stats-master.png
> >>>
> >>> Master + the patch:
> >>> bcache-stats-patch.png
> >>>
> >>> That's all the testing done with 400GB NVME with 512B block size.
> >>>
> >>> Coly, do you want me to continue the same testing on 1TB nvme with
> >>> different block size ?
> >>> or is it ok to skip the 1TB testing and continue the test with 400GB
> >>> NVME but with different block size?
> >>> feel free to let me know any other test scenarios that we should cover
> >>> here.
> >>
> >> Yes please, more testing is desired for performance improvement. So far
> >> I don't see performance number for real high work load yet.
> >>
> >> Thanks.
> >>
> >> Coly Li
> >>
>
Hi Dongdong,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Dongdong-Tao/bcache-consider-the-fragmentation-when-update-the-writeback-rate/20210105-110903
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git e71ba9452f0b5b2e8dc8aa5445198cd9214a6a62
config: i386-randconfig-a002-20200806 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
# https://github.com/0day-ci/linux/commit/7777fef68d1401235db42dd0d59c5c3dba3d42d3
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Dongdong-Tao/bcache-consider-the-fragmentation-when-update-the-writeback-rate/20210105-110903
git checkout 7777fef68d1401235db42dd0d59c5c3dba3d42d3
# save the attached .config to linux build tree
make W=1 ARCH=i386
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
ld: drivers/md/bcache/writeback.o: in function `__update_writeback_rate':
>> drivers/md/bcache/writeback.c:106: undefined reference to `__divdi3'
>> ld: drivers/md/bcache/writeback.c:120: undefined reference to `__divdi3'
vim +106 drivers/md/bcache/writeback.c
60
61 static void __update_writeback_rate(struct cached_dev *dc)
62 {
63 /*
64 * PI controller:
65 * Figures out the amount that should be written per second.
66 *
67 * First, the error (number of sectors that are dirty beyond our
68 * target) is calculated. The error is accumulated (numerically
69 * integrated).
70 *
71 * Then, the proportional value and integral value are scaled
72 * based on configured values. These are stored as inverses to
73 * avoid fixed point math and to make configuration easy-- e.g.
74 * the default value of 40 for writeback_rate_p_term_inverse
75 * attempts to write at a rate that would retire all the dirty
76 * blocks in 40 seconds.
77 *
78 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
79 * of the error is accumulated in the integral term per second.
80 * This acts as a slow, long-term average that is not subject to
81 * variations in usage like the p term.
82 */
83 int64_t target = __calc_target_rate(dc);
84 int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
85 int64_t error = dirty - target;
86 int64_t proportional_scaled =
87 div_s64(error, dc->writeback_rate_p_term_inverse);
88 int64_t integral_scaled;
89 uint32_t new_rate;
90
91 /*
92 * We need to consider the number of dirty buckets as well
93 * when calculating the proportional_scaled, Otherwise we might
94 * have an unreasonable small writeback rate at a highly fragmented situation
95 * when very few dirty sectors consumed a lot dirty buckets, the
96 * worst case is when dirty_data reached writeback_percent and
97 * dirty buckets reached to cutoff_writeback_sync, but the rate
98 * still will be at the minimum value, which will cause the write
99 * stuck at a non-writeback mode.
100 */
101 struct cache_set *c = dc->disk.c;
102
103 int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
104
105 if (c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
> 106 int64_t fragment = (dirty_buckets * c->cache->sb.bucket_size) / dirty;
107 int64_t fp_term;
108 int64_t fps;
109
110 if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
111 fp_term = dc->writeback_rate_fp_term_low *
112 (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
113 } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
114 fp_term = dc->writeback_rate_fp_term_mid *
115 (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
116 } else {
117 fp_term = dc->writeback_rate_fp_term_high *
118 (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
119 }
> 120 fps = (dirty / dirty_buckets) * fp_term;
121 if (fragment > 3 && fps > proportional_scaled) {
122 //Only overwrite the p when fragment > 3
123 proportional_scaled = fps;
124 }
125 }
126
127 if ((error < 0 && dc->writeback_rate_integral > 0) ||
128 (error > 0 && time_before64(local_clock(),
129 dc->writeback_rate.next + NSEC_PER_MSEC))) {
130 /*
131 * Only decrease the integral term if it's more than
132 * zero. Only increase the integral term if the device
133 * is keeping up. (Don't wind up the integral
134 * ineffectively in either case).
135 *
136 * It's necessary to scale this by
137 * writeback_rate_update_seconds to keep the integral
138 * term dimensioned properly.
139 */
140 dc->writeback_rate_integral += error *
141 dc->writeback_rate_update_seconds;
142 }
143
144 integral_scaled = div_s64(dc->writeback_rate_integral,
145 dc->writeback_rate_i_term_inverse);
146
147 new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
148 dc->writeback_rate_minimum, NSEC_PER_SEC);
149
150 dc->writeback_rate_proportional = proportional_scaled;
151 dc->writeback_rate_integral_scaled = integral_scaled;
152 dc->writeback_rate_change = new_rate -
153 atomic_long_read(&dc->writeback_rate.rate);
154 atomic_long_set(&dc->writeback_rate.rate, new_rate);
155 dc->writeback_rate_target = target;
156 }
157
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
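The two __divdi3 references above come from the plain '/' on int64_t operands
at lines 106 and 120, which a 32-bit i386 build cannot lower without libgcc.
A minimal sketch of how those two divisions could be written with the 64-bit
division helpers from <linux/math64.h> (this only illustrates the helpers, it
is not necessarily the fix that was eventually applied):
---
#include <linux/math64.h>

	/* line 106: s64 / s64, use div64_s64() instead of the '/' operator */
	int64_t fragment = div64_s64(dirty_buckets * c->cache->sb.bucket_size,
				     dirty);
	...
	/* line 120: likewise, avoid the open-coded 64-bit division */
	fps = div64_s64(dirty, dirty_buckets) * fp_term;
---
div_s64() (already used for the p and i terms in the listing) would also work
for line 120 if dirty_buckets is known to fit in 32 bits.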