2018-07-31 20:39:03

by Dennis Zhou

[permalink] [raw]
Subject: [PATCH] block: make iolatency avg_lat exponentially decay

From: "Dennis Zhou (Facebook)" <[email protected]>

Currently, avg_lat is calculated by accumulating the mean of every
window in a long running cumulative average. As time goes on, the metric
becomes less and less useful due to the accumulated history.

This patch reuses the same calculation done in load averages to make the
avg_lat metric more lively. Unlike load averages, the avg only advances
when a window elapses (due to an io). Idle periods extend the most
recent window. Bucketing is used to limit the history of avg_lat by
binding it to the window size. So, the window range for 1/exp (decay
rate) is [1 min, 2.5 min) when windows elapse immediately.

Signed-off-by: Dennis Zhou <[email protected]>
---
block/blk-iolatency.c | 45 ++++++++++++++++++++++++++++++-------------
1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index bb59b2929e0d..1db3244eac05 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -69,6 +69,7 @@
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
+#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include "blk-rq-qos.h"
@@ -127,7 +128,6 @@ struct iolatency_grp {

/* total running average of our io latency. */
u64 total_lat_avg;
- u64 total_lat_nr;

/* Our current number of IO's for the last summation. */
u64 nr_samples;
@@ -135,6 +135,24 @@ struct iolatency_grp {
struct child_latency_info child_lat;
};

+#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
+#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
+/*
+ * These are the constants used to fake the fixed-point moving average
+ * calculation just like load average. The latency window is bucketed to
+ * try to approximately calculate average latency for the last 1 minute.
+ */
+#define BLKIOLATENCY_NR_EXP_FACTORS 5
+#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
+ (BLKIOLATENCY_NR_EXP_FACTORS - 1))
+static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
+ 2045, // exp(1/600) - 600 samples
+ 2039, // exp(1/240) - 240 samples
+ 2031, // exp(1/120) - 120 samples
+ 2023, // exp(1/80) - 80 samples
+ 2014, // exp(1/60) - 60 samples
+};
+
static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
@@ -462,7 +480,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
struct child_latency_info *lat_info;
struct blk_rq_stat stat;
unsigned long flags;
- int cpu;
+ int cpu, exp_idx;

blk_rq_stat_init(&stat);
preempt_disable();
@@ -480,11 +498,10 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)

lat_info = &parent->child_lat;

- iolat->total_lat_avg =
- div64_u64((iolat->total_lat_avg * iolat->total_lat_nr) +
- stat.mean, iolat->total_lat_nr + 1);
-
- iolat->total_lat_nr++;
+ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+ iolat->cur_win_nsec / BLKIOLATENCY_EXP_BUCKET_SIZE);
+ CALC_LOAD(iolat->total_lat_avg, iolatency_exp_factors[exp_idx],
+ stat.mean);

/* Everything is ok and we don't need to adjust the scale. */
if (stat.mean <= iolat->min_lat_nsec &&
@@ -700,8 +717,9 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
u64 oldval = iolat->min_lat_nsec;

iolat->min_lat_nsec = val;
- iolat->cur_win_nsec = max_t(u64, val << 4, 100 * NSEC_PER_MSEC);
- iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC);
+ iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
+ iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
+ BLKIOLATENCY_MAX_WIN_SIZE);

if (!oldval && val)
atomic_inc(&blkiolat->enabled);
@@ -811,13 +829,14 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
{
struct iolatency_grp *iolat = pd_to_lat(pd);
unsigned long long avg_lat = div64_u64(iolat->total_lat_avg, NSEC_PER_USEC);
+ unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);

if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " depth=max avg_lat=%llu",
- avg_lat);
+ return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
+ avg_lat, cur_win);

- return scnprintf(buf, size, " depth=%u avg_lat=%llu",
- iolat->rq_depth.max_depth, avg_lat);
+ return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
+ iolat->rq_depth.max_depth, avg_lat, cur_win);
}


--
2.17.1



2018-07-31 21:20:43

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] block: make iolatency avg_lat exponentially decay

Hi Dennis,

this generally looks good to me. Just two small nit picks:

On Tue, Jul 31, 2018 at 01:36:47PM -0700, Dennis Zhou wrote:
> @@ -135,6 +135,24 @@ struct iolatency_grp {
> struct child_latency_info child_lat;
> };
>
> +#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
> +#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
> +/*
> + * These are the constants used to fake the fixed-point moving average
> + * calculation just like load average. The latency window is bucketed to
> + * try to approximately calculate average latency for the last 1 minute.
> + */
> +#define BLKIOLATENCY_NR_EXP_FACTORS 5
> +#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
> + (BLKIOLATENCY_NR_EXP_FACTORS - 1))
> +static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
> + 2045, // exp(1/600) - 600 samples
> + 2039, // exp(1/240) - 240 samples
> + 2031, // exp(1/120) - 120 samples
> + 2023, // exp(1/80) - 80 samples
> + 2014, // exp(1/60) - 60 samples

Might be useful to drop the FIXED_1 name in a comment here. It says
"fixed-point", and "load average", but since the numbers are directly
in relationship to that constant, it'd be good to name it I think.

> @@ -462,7 +480,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
> struct child_latency_info *lat_info;
> struct blk_rq_stat stat;
> unsigned long flags;
> - int cpu;
> + int cpu, exp_idx;
>
> blk_rq_stat_init(&stat);
> preempt_disable();
> @@ -480,11 +498,10 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
>
> lat_info = &parent->child_lat;
>
> - iolat->total_lat_avg =
> - div64_u64((iolat->total_lat_avg * iolat->total_lat_nr) +
> - stat.mean, iolat->total_lat_nr + 1);
> -
> - iolat->total_lat_nr++;
> + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
> + iolat->cur_win_nsec / BLKIOLATENCY_EXP_BUCKET_SIZE);
> + CALC_LOAD(iolat->total_lat_avg, iolatency_exp_factors[exp_idx],
> + stat.mean);

The load average keeps the running value in fixed point presentation
to avoid rounding errors. I guess because this is IO time in ns, the
values are so much higher than the FIXED_1 denominator (2048) that
rounding errors are negligible, and we don't need to bother with it.

Can you mention that in a comment, please?

2018-08-01 00:15:42

by Dennis Zhou

[permalink] [raw]
Subject: Re: [PATCH] block: make iolatency avg_lat exponentially decay

Hi Johannes,

On Tue, Jul 31, 2018 at 05:21:50PM -0400, Johannes Weiner wrote:
> Hi Dennis,
>
> this generally looks good to me. Just two small nit picks:
>
> On Tue, Jul 31, 2018 at 01:36:47PM -0700, Dennis Zhou wrote:
> > @@ -135,6 +135,24 @@ struct iolatency_grp {
> > struct child_latency_info child_lat;
> > };
> >
> > +#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
> > +#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
> > +/*
> > + * These are the constants used to fake the fixed-point moving average
> > + * calculation just like load average. The latency window is bucketed to
> > + * try to approximately calculate average latency for the last 1 minute.
> > + */
> > +#define BLKIOLATENCY_NR_EXP_FACTORS 5
> > +#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
> > + (BLKIOLATENCY_NR_EXP_FACTORS - 1))
> > +static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
> > + 2045, // exp(1/600) - 600 samples
> > + 2039, // exp(1/240) - 240 samples
> > + 2031, // exp(1/120) - 120 samples
> > + 2023, // exp(1/80) - 80 samples
> > + 2014, // exp(1/60) - 60 samples
>
> Might be useful to drop the FIXED_1 name in a comment here. It says
> "fixed-point", and "load average", but since the numbers are directly
> in relationship to that constant, it'd be good to name it I think.
>

I've added a comment in v2 that points out FIXED_1 and mentions its
value is 2048. I also explained a little more about the samples and
binding the 1/exp window.

> > + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
> > + iolat->cur_win_nsec / BLKIOLATENCY_EXP_BUCKET_SIZE);
> > + CALC_LOAD(iolat->total_lat_avg, iolatency_exp_factors[exp_idx],
> > + stat.mean);
>
> The load average keeps the running value in fixed point presentation
> to avoid rounding errors. I guess because this is IO time in ns, the
> values are so much higher than the FIXED_1 denominator (2048) that
> rounding errors are negligible, and we don't need to bother with it.
>
> Can you mention that in a comment, please?

I've added what you said here in a comment above this.

Thanks,
Dennis