2024-01-26 22:09:43

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 1/5] mean and variance: Promote to lib/math

Small statistics library, for taking in a series of values and computing
mean, weighted mean, standard deviation and weighted deviation.

The main use case is for statistics on latency measurements.

Signed-off-by: Kent Overstreet <[email protected]>
Cc: Daniel Hill <[email protected]>
Cc: Darrick J. Wong <[email protected]>
---
MAINTAINERS | 9 +++++++++
fs/bcachefs/Kconfig | 10 +---------
fs/bcachefs/Makefile | 3 ---
fs/bcachefs/util.c | 2 +-
fs/bcachefs/util.h | 3 +--
{fs/bcachefs => include/linux}/mean_and_variance.h | 0
lib/Kconfig.debug | 9 +++++++++
lib/math/Kconfig | 3 +++
lib/math/Makefile | 2 ++
{fs/bcachefs => lib/math}/mean_and_variance.c | 3 +--
{fs/bcachefs => lib/math}/mean_and_variance_test.c | 3 +--
11 files changed, 28 insertions(+), 19 deletions(-)
rename {fs/bcachefs => include/linux}/mean_and_variance.h (100%)
rename {fs/bcachefs => lib/math}/mean_and_variance.c (99%)
rename {fs/bcachefs => lib/math}/mean_and_variance_test.c (99%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d1052fa6a69..de635cfd354d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13379,6 +13379,15 @@ S: Maintained
F: drivers/net/mdio/mdio-regmap.c
F: include/linux/mdio/mdio-regmap.h

+MEAN AND VARIANCE LIBRARY
+M: Daniel B. Hill <[email protected]>
+M: Kent Overstreet <[email protected]>
+S: Maintained
+T: git https://github.com/YellowOnion/linux/
+F: include/linux/mean_and_variance.h
+F: lib/math/mean_and_variance.c
+F: lib/math/mean_and_variance_test.c
+
MEASUREMENT COMPUTING CIO-DAC IIO DRIVER
M: William Breathitt Gray <[email protected]>
L: [email protected]
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5cdfef3b551a..72d1179262b3 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -24,6 +24,7 @@ config BCACHEFS_FS
select XXHASH
select SRCU
select SYMBOLIC_ERRNAME
+ select MEAN_AND_VARIANCE
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -86,12 +87,3 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
Instead of immediately sleeping when attempting to take a six lock that
is held by another thread, spin for a short while, as long as the
thread owning the lock is running.
-
-config MEAN_AND_VARIANCE_UNIT_TEST
- tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
- depends on KUNIT
- depends on BCACHEFS_FS
- default KUNIT_ALL_TESTS
- help
- This option enables the kunit tests for mean_and_variance module.
- If unsure, say N.
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..b11ba74b8ad4 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -57,7 +57,6 @@ bcachefs-y := \
keylist.o \
logged_ops.o \
lru.o \
- mean_and_variance.o \
migrate.o \
move.o \
movinggc.o \
@@ -88,5 +87,3 @@ bcachefs-y := \
util.o \
varint.o \
xattr.o
-
-obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 56b815fd9fc6..d7ea95abb9df 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -22,9 +22,9 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
+#include <linux/mean_and_variance.h>

#include "eytzinger.h"
-#include "mean_and_variance.h"
#include "util.h"

static const char si_units[] = "?kMGTPEZY";
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59a5..0059481995ef 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -17,8 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-
-#include "mean_and_variance.h"
+#include <linux/mean_and_variance.h>

#include "darray.h"

diff --git a/fs/bcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h
similarity index 100%
rename from fs/bcachefs/mean_and_variance.h
rename to include/linux/mean_and_variance.h
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 975a07f9f1cc..817ddfe132cd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2191,6 +2191,15 @@ config CPUMASK_KUNIT_TEST

If unsure, say N.

+config MEAN_AND_VARIANCE_UNIT_TEST
+ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ select MEAN_AND_VARIANCE
+ default KUNIT_ALL_TESTS
+ help
+ This option enables the kunit tests for mean_and_variance module.
+ If unsure, say N.
+
config TEST_LIST_SORT
tristate "Linked list sorting test" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/lib/math/Kconfig b/lib/math/Kconfig
index 0634b428d0cb..7530ae9a3584 100644
--- a/lib/math/Kconfig
+++ b/lib/math/Kconfig
@@ -15,3 +15,6 @@ config PRIME_NUMBERS

config RATIONAL
tristate
+
+config MEAN_AND_VARIANCE
+ tristate
diff --git a/lib/math/Makefile b/lib/math/Makefile
index 91fcdb0c9efe..8cdfa13a67ce 100644
--- a/lib/math/Makefile
+++ b/lib/math/Makefile
@@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_log.o int_pow.o int_sqrt.o reciprocal_div.o
obj-$(CONFIG_CORDIC) += cordic.o
obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
obj-$(CONFIG_RATIONAL) += rational.o
+obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o

obj-$(CONFIG_TEST_DIV64) += test_div64.o
obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o
+obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
diff --git a/fs/bcachefs/mean_and_variance.c b/lib/math/mean_and_variance.c
similarity index 99%
rename from fs/bcachefs/mean_and_variance.c
rename to lib/math/mean_and_variance.c
index bf0ef668fd38..ba90293204ba 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/lib/math/mean_and_variance.c
@@ -40,10 +40,9 @@
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/math64.h>
+#include <linux/mean_and_variance.h>
#include <linux/module.h>

-#include "mean_and_variance.h"
-
u128_u u128_div(u128_u n, u64 d)
{
u128_u r;
diff --git a/fs/bcachefs/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c
similarity index 99%
rename from fs/bcachefs/mean_and_variance_test.c
rename to lib/math/mean_and_variance_test.c
index 019583c3ca0e..f45591a169d8 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/lib/math/mean_and_variance_test.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <kunit/test.h>
-
-#include "mean_and_variance.h"
+#include <linux/mean_and_variance.h>

#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)

--
2.43.0



2024-01-26 22:10:04

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 3/5] bcachefs: bch2_time_stats_to_seq_buf()

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/super.c | 2 +
fs/bcachefs/util.c | 129 +++++++++++++++++++++++++++++++++++++++-----
fs/bcachefs/util.h | 4 ++
3 files changed, 121 insertions(+), 14 deletions(-)

diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index da8697c79a97..e74534096cb5 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1262,6 +1262,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,

bch2_time_stats_init(&ca->io_latency[READ]);
bch2_time_stats_init(&ca->io_latency[WRITE]);
+ ca->io_latency[READ].quantiles_enabled = true;
+ ca->io_latency[WRITE].quantiles_enabled = true;

ca->mi = bch2_mi_to_cpu(member);

diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index c7cf9c6fcf9a..f2c8550c1331 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -505,10 +505,8 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64

void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
- const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
- int i;
+ u64 f_stddev = 0, d_stddev = 0;

if (stats->buffer) {
int cpu;
@@ -607,19 +605,122 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats

printbuf_tabstops_reset(out);

- i = eytzinger0_first(NR_QUANTILES);
- u = pick_time_units(stats->quantiles.entries[i].m);
+ if (stats->quantiles_enabled) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ pick_time_units(stats->quantiles.entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(stats->quantiles.entries[i].m, last_q);
+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ prt_newline(out);
+ last_q = q;
+ }
+ }
+}
+
+#include <linux/seq_buf.h>
+
+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
+}
+
+void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats)
+{
+ s64 f_mean = 0, d_mean = 0;
+ u64 f_stddev = 0, d_stddev = 0;
+
+ if (stats->buffer) {
+ int cpu;

- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }

- q = max(stats->quantiles.entries[i].m, last_q);
- prt_printf(out, "%llu ",
- div_u64(q, u->nsecs));
- if (is_last)
- prt_newline(out);
- last_q = q;
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
+
+ seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
+
+ seq_buf_printf(out, " since mount recent\n");
+
+ seq_buf_printf(out, "duration of events\n");
+
+ seq_buf_printf(out, " min: ");
+ seq_buf_time_units_aligned(out, stats->min_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " max: ");
+ seq_buf_time_units_aligned(out, stats->max_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " total: ");
+ seq_buf_time_units_aligned(out, stats->total_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " mean: ");
+ seq_buf_time_units_aligned(out, d_mean);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " stddev: ");
+ seq_buf_time_units_aligned(out, d_stddev);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, "time between events\n");
+
+ seq_buf_printf(out, " min: ");
+ seq_buf_time_units_aligned(out, stats->min_freq);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " max: ");
+ seq_buf_time_units_aligned(out, stats->max_freq);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " mean: ");
+ seq_buf_time_units_aligned(out, f_mean);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " stddev: ");
+ seq_buf_time_units_aligned(out, f_stddev);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ if (stats->quantiles_enabled) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ pick_time_units(stats->quantiles.entries[i].m);
+ u64 last_q = 0;
+
+ prt_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(stats->quantiles.entries[i].m, last_q);
+ seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ seq_buf_printf(out, "\n");
+ last_q = q;
+ }
}
}
#else
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index c3b11c3d24ea..7ff2d4fe26f6 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -382,6 +382,7 @@ struct bch2_time_stat_buffer {

struct bch2_time_stats {
spinlock_t lock;
+ bool quantiles_enabled;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@@ -435,6 +436,9 @@ static inline bool track_event_change(struct bch2_time_stats *stats,

void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);

+struct seq_buf;
+void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *);
+
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);

--
2.43.0


2024-01-26 22:10:08

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 2/5] eytzinger: Promote to include/linux/

eytzinger trees are a faster alternative to binary search. They're a bit
more expensive to set up, but lookups perform much better assuming the
tree isn't entirely in cache.

Binary search is a worst case scenario for branch prediction and
prefetching, but eytzinger trees have children adjacent in memory and
thus we can prefetch before knowing the result of a comparison.

An eytzinger tree is a binary tree laid out in an array, with the same
geometry as the usual binary heap construction, but used as a search
tree instead.

Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/bset.c | 2 +-
fs/bcachefs/journal_seq_blacklist.c | 6 +-
fs/bcachefs/replicas.c | 17 ++-
fs/bcachefs/replicas.h | 3 +-
fs/bcachefs/super-io.h | 2 +-
fs/bcachefs/util.c | 145 +--------------------
fs/bcachefs/util.h | 4 -
{fs/bcachefs => include/linux}/eytzinger.h | 56 ++++----
lib/sort.c | 85 ++++++++++++
9 files changed, 136 insertions(+), 184 deletions(-)
rename {fs/bcachefs => include/linux}/eytzinger.h (78%)

diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..1d77aa55d641 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -9,12 +9,12 @@
#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
-#include "eytzinger.h"
#include "trace.h"
#include "util.h"

#include <asm/unaligned.h>
#include <linux/console.h>
+#include <linux/eytzinger.h>
#include <linux/random.h>
#include <linux/prefetch.h>

diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cfbb..024c9b1b323f 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -2,10 +2,11 @@

#include "bcachefs.h"
#include "btree_iter.h"
-#include "eytzinger.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"

+#include <linux/eytzinger.h>
+
/*
* journal_seq_blacklist machinery:
*
@@ -119,8 +120,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
return ret ?: bch2_blacklist_table_initialize(c);
}

-static int journal_seq_blacklist_table_cmp(const void *_l,
- const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..75fdce373f76 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
#include "replicas.h"
#include "super-io.h"

+#include <linux/sort.h>
+
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
+ size_t size = (size_t) priv;
return memcmp(l, r, size);
}

@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+ eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+ bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
{
unsigned i;

- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- bch2_memcmp, NULL);
+ sort_r(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ bch2_memcmp, NULL,
+ (void *)(size_t)cpu_r->entry_size);

for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 654a4b26d3a3..983cce782ac2 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -3,9 +3,10 @@
#define _BCACHEFS_REPLICAS_H

#include "bkey.h"
-#include "eytzinger.h"
#include "replicas_types.h"

+#include <linux/eytzinger.h>
+
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 95e80e06316b..f37620919e11 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -3,12 +3,12 @@
#define _BCACHEFS_SUPER_IO_H

#include "extents.h"
-#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
#include "sb-members.h"

#include <asm/byteorder.h>
+#include <linux/eytzinger.h>

static inline bool bch2_version_compatible(u16 version)
{
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index d7ea95abb9df..c7cf9c6fcf9a 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -11,6 +11,7 @@
#include <linux/console.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/eytzinger.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/log2.h>
@@ -24,7 +25,6 @@
#include <linux/sched/clock.h>
#include <linux/mean_and_variance.h>

-#include "eytzinger.h"
#include "util.h"

static const char si_units[] = "?kMGTPEZY";
@@ -863,149 +863,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}

-static int alignment_ok(const void *base, size_t align)
-{
- return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
- ((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
- u32 t = *(u32 *)a;
- *(u32 *)a = *(u32 *)b;
- *(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
- u64 t = *(u64 *)a;
- *(u64 *)a = *(u64 *)b;
- *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
- char t;
-
- do {
- t = *(char *)a;
- *(char *)a++ = *(char *)b;
- *(char *)b++ = t;
- } while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- size_t l, size_t r)
-{
- return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
- void (*swap_func)(void *, void *, size_t),
- size_t l, size_t r)
-{
- swap_func(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
- size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t))
-{
- int i, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
- for (r = i; r * 2 + 1 < n; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < n &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-
- /* sort */
- for (i = n - 1; i > 0; --i) {
- do_swap(base, n, size, swap_func, 0, i);
-
- for (r = 0; r * 2 + 1 < i; r = c) {
- c = r * 2 + 1;
-
- if (c + 1 < i &&
- do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
- c++;
-
- if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
- break;
-
- do_swap(base, n, size, swap_func, r, c);
- }
- }
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
-
- if (!swap_func) {
- if (size == 4 && alignment_ok(base, 4))
- swap_func = u32_swap;
- else if (size == 8 && alignment_ok(base, 8))
- swap_func = u64_swap;
- else
- swap_func = generic_swap;
- }
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size, size) < 0)
- c += size;
- if (cmp_func(base + r, base + c, size) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}
-
static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 0059481995ef..c3b11c3d24ea 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -737,10 +737,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
memset(s + bytes, c, rem);
}

-void sort_cmp_size(void *base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
-
/* just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos) \
memmove(&(_array)[(_pos) + 1], \
diff --git a/fs/bcachefs/eytzinger.h b/include/linux/eytzinger.h
similarity index 78%
rename from fs/bcachefs/eytzinger.h
rename to include/linux/eytzinger.h
index b04750dbf870..9565a5c26cd5 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/include/linux/eytzinger.h
@@ -1,27 +1,37 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _EYTZINGER_H
-#define _EYTZINGER_H
+#ifndef _LINUX_EYTZINGER_H
+#define _LINUX_EYTZINGER_H

#include <linux/bitops.h>
#include <linux/log2.h>

-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif

/*
* Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
*
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
+ *
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
*/

static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);

return (i << 1) + child;
}
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)

static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);

if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)

static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i > size);
+ EYTZINGER_BUG_ON(i > size);

if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned shift = __fls(size) - b;
int s;

- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);

i ^= 1U << b;
i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned shift;
int s;

- EBUG_ON(!i || i > size);
+ EYTZINGER_BUG_ON(!i || i > size);

/*
* sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)

static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
- EBUG_ON(child > 1);
+ EYTZINGER_BUG_ON(child > 1);

return (i << 1) + 1 + child;
}
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))

-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- eytzinger_cmp_fn cmp, const void *search)
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;

@@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,

do {
i = n;
- n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+ n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
} while (n < nr);

if (n & 1) {
@@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
_i; \
})

-void eytzinger0_sort(void *, size_t, size_t,
- int (*cmp_func)(const void *, const void *, size_t),
- void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+ cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);

-#endif /* _EYTZINGER_H */
+#endif /* _LINUX_EYTZINGER_H */
diff --git a/lib/sort.c b/lib/sort.c
index b399bf10d675..3dfa83d86bbb 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -290,3 +290,88 @@ void sort(void *base, size_t num, size_t size,
return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
EXPORT_SYMBOL(sort);
+
+#include <linux/eytzinger.h>
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func, const void *priv,
+ size_t l, size_t r)
+{
+ return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+ swap_r_func_t swap_func, const void *priv,
+ size_t l, size_t r)
+{
+ do_swap(base + inorder_to_eytzinger0(l, n) * size,
+ base + inorder_to_eytzinger0(r, n) * size,
+ size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ int i, c, r;
+
+ if (!swap_func) {
+ if (is_aligned(base, size, 8))
+ swap_func = SWAP_WORDS_64;
+ else if (is_aligned(base, size, 4))
+ swap_func = SWAP_WORDS_32;
+ else
+ swap_func = SWAP_BYTES;
+ }
+
+ /* heapify */
+ for (i = n / 2 - 1; i >= 0; --i) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < n &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; --i) {
+ eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+
+ if (c + 1 < i &&
+ eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+ c++;
+
+ if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+ break;
+
+ eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+ cmp_func_t cmp_func,
+ swap_func_t swap_func)
+{
+ struct wrapper w = {
+ .cmp = cmp_func,
+ .swap = swap_func,
+ };
+
+ return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
+EXPORT_SYMBOL_GPL(eytzinger0_sort);
--
2.43.0


2024-01-26 22:10:43

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 5/5] bcache: Convert to lib/time_stats

Delete bcache's time stats code and convert to the newer version from bcachefs.

example output:

root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times
count: 6414
since mount recent
duration of events
min: 440 ns
max: 1102 us
total: 674 ms
mean: 105 us 102 us
stddev: 101 us 88 us
time between events
min: 881 ns
max: 3 s
mean: 7 ms 6 ms
stddev: 52 ms 6 ms

Cc: Coly Li <[email protected]>
Cc: [email protected]
Signed-off-by: Kent Overstreet <[email protected]>
---
drivers/md/bcache/Kconfig | 1 +
drivers/md/bcache/bcache.h | 1 +
drivers/md/bcache/bset.c | 6 +++--
drivers/md/bcache/bset.h | 1 +
drivers/md/bcache/btree.c | 6 ++---
drivers/md/bcache/super.c | 7 +++++
drivers/md/bcache/sysfs.c | 25 +++++++++---------
drivers/md/bcache/util.c | 30 ----------------------
drivers/md/bcache/util.h | 52 +++++---------------------------------
9 files changed, 37 insertions(+), 92 deletions(-)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index b2d10063d35f..7ea057983d3d 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -5,6 +5,7 @@ config BCACHE
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
select CLOSURES
+ select TIME_STATS
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6ae2329052c9..76e7b494c394 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -186,6 +186,7 @@
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/refcount.h>
+#include <linux/time_stats.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 2bba4d6aaaa2..31c08d4ab83b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1177,6 +1177,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,

void bch_bset_sort_state_free(struct bset_sort_state *state)
{
+ time_stats_exit(&state->time);
mempool_exit(&state->pool);
}

@@ -1184,6 +1185,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
unsigned int page_order)
{
spin_lock_init(&state->time.lock);
+ time_stats_init(&state->time);

state->page_order = page_order;
state->crit_factor = int_sqrt(1 << page_order);
@@ -1286,7 +1288,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
bch_bset_build_written_tree(b);

if (!start)
- bch_time_stats_update(&state->time, start_time);
+ time_stats_update(&state->time, start_time);
}

void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
@@ -1329,7 +1331,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,

btree_mergesort(b, new->set->data, &iter, false, true);

- bch_time_stats_update(&state->time, start_time);
+ time_stats_update(&state->time, start_time);

new->set->size = 0; // XXX: why?
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index d795c84246b0..13e524ad7783 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -3,6 +3,7 @@
#define _BCACHE_BSET_H

#include <linux/kernel.h>
+#include <linux/time_stats.h>
#include <linux/types.h>

#include "bcache_ondisk.h"
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 196cdacce38f..0ed337c5f0dc 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
goto err;

bch_btree_node_read_done(b);
- bch_time_stats_update(&b->c->btree_read_time, start_time);
+ time_stats_update(&b->c->btree_read_time, start_time);

return;
err:
@@ -1852,7 +1852,7 @@ static void bch_btree_gc(struct cache_set *c)
bch_btree_gc_finish(c);
wake_up_allocators(c);

- bch_time_stats_update(&c->btree_gc_time, start_time);
+ time_stats_update(&c->btree_gc_time, start_time);

stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
@@ -2343,7 +2343,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
btree_node_free(b);
rw_unlock(true, n1);

- bch_time_stats_update(&b->c->btree_split_time, start_time);
+ time_stats_update(&b->c->btree_split_time, start_time);

return 0;
err_free2:
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index dc3f50f69714..625e4883299c 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1676,6 +1676,9 @@ static CLOSURE_CALLBACK(cache_set_free)

debugfs_remove(c->debug);

+ time_stats_exit(&c->btree_read_time);
+ time_stats_exit(&c->btree_split_time);
+ time_stats_exit(&c->btree_gc_time);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
bch_journal_free(c);
@@ -1913,6 +1916,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);

+ time_stats_init(&c->btree_gc_time);
+ time_stats_init(&c->btree_split_time);
+ time_stats_init(&c->btree_read_time);
+
iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
sizeof(struct btree_iter_set);

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a438efb66069..01cc5c632f08 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -14,6 +14,7 @@
#include "features.h"

#include <linux/blkdev.h>
+#include <linux/seq_buf.h>
#include <linux/sort.h>
#include <linux/sched/clock.h>

@@ -79,10 +80,10 @@ read_attribute(active_journal_entries);
read_attribute(backing_dev_name);
read_attribute(backing_dev_uuid);

-sysfs_time_stats_attribute(btree_gc, sec, ms);
-sysfs_time_stats_attribute(btree_split, sec, us);
-sysfs_time_stats_attribute(btree_sort, ms, us);
-sysfs_time_stats_attribute(btree_read, ms, us);
+read_attribute(btree_gc_times);
+read_attribute(btree_split_times);
+read_attribute(btree_sort_times);
+read_attribute(btree_read_times);

read_attribute(btree_nodes);
read_attribute(btree_used_percent);
@@ -743,10 +744,10 @@ SHOW(__bch_cache_set)
sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);

- sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
- sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
- sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
- sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
+ sysfs_print_time_stats(&c->btree_gc_time, btree_gc_times);
+ sysfs_print_time_stats(&c->btree_split_time, btree_split_times);
+ sysfs_print_time_stats(&c->sort.time, btree_sort_times);
+ sysfs_print_time_stats(&c->btree_read_time, btree_read_times);

sysfs_print(btree_used_percent, bch_btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes);
@@ -989,10 +990,10 @@ KTYPE(bch_cache_set);
static struct attribute *bch_cache_set_internal_attrs[] = {
&sysfs_active_journal_entries,

- sysfs_time_stats_attribute_list(btree_gc, sec, ms)
- sysfs_time_stats_attribute_list(btree_split, sec, us)
- sysfs_time_stats_attribute_list(btree_sort, ms, us)
- sysfs_time_stats_attribute_list(btree_read, ms, us)
+ &sysfs_btree_gc_times,
+ &sysfs_btree_split_times,
+ &sysfs_btree_sort_times,
+ &sysfs_btree_read_times,

&sysfs_btree_nodes,
&sysfs_btree_used_percent,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index ae380bc3992e..95282bf0f9a7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid)
return i;
}

-void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
-{
- uint64_t now, duration, last;
-
- spin_lock(&stats->lock);
-
- now = local_clock();
- duration = time_after64(now, start_time)
- ? now - start_time : 0;
- last = time_after64(now, stats->last)
- ? now - stats->last : 0;
-
- stats->max_duration = max(stats->max_duration, duration);
-
- if (stats->last) {
- ewma_add(stats->average_duration, duration, 8, 8);
-
- if (stats->average_frequency)
- ewma_add(stats->average_frequency, last, 8, 8);
- else
- stats->average_frequency = last << 8;
- } else {
- stats->average_duration = duration << 8;
- }
-
- stats->last = now ?: 1;
-
- spin_unlock(&stats->lock);
-}
-
/**
* bch_next_delay() - update ratelimiting statistics and calculate next delay
* @d: the struct bch_ratelimit to update
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index f61ab1bada6c..6fcb9db4f50d 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -344,20 +344,6 @@ ssize_t bch_hprint(char *buf, int64_t v);
bool bch_is_zero(const char *p, size_t n);
int bch_parse_uuid(const char *s, char *uuid);

-struct time_stats {
- spinlock_t lock;
- /*
- * all fields are in nanoseconds, averages are ewmas stored left shifted
- * by 8
- */
- uint64_t max_duration;
- uint64_t average_duration;
- uint64_t average_frequency;
- uint64_t last;
-};
-
-void bch_time_stats_update(struct time_stats *stats, uint64_t time);
-
static inline unsigned int local_clock_us(void)
{
return local_clock() >> 10;
@@ -372,40 +358,16 @@ static inline unsigned int local_clock_us(void)
sysfs_print(name ## _ ## stat ## _ ## units, \
div_u64((stats)->stat >> 8, NSEC_PER_ ## units))

-#define sysfs_print_time_stats(stats, name, \
- frequency_units, \
- duration_units) \
+#define sysfs_print_time_stats(stats, name) \
do { \
- __print_time_stat(stats, name, \
- average_frequency, frequency_units); \
- __print_time_stat(stats, name, \
- average_duration, duration_units); \
- sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
- div_u64((stats)->max_duration, \
- NSEC_PER_ ## duration_units)); \
- \
- sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
- ? div_s64(local_clock() - (stats)->last, \
- NSEC_PER_ ## frequency_units) \
- : -1LL); \
+ if (attr == &sysfs_##name) { \
+ struct seq_buf seq; \
+ seq_buf_init(&seq, buf, PAGE_SIZE); \
+ time_stats_to_seq_buf(&seq, stats); \
+ return seq.len; \
+ } \
} while (0)

-#define sysfs_time_stats_attribute(name, \
- frequency_units, \
- duration_units) \
-read_attribute(name ## _average_frequency_ ## frequency_units); \
-read_attribute(name ## _average_duration_ ## duration_units); \
-read_attribute(name ## _max_duration_ ## duration_units); \
-read_attribute(name ## _last_ ## frequency_units)
-
-#define sysfs_time_stats_attribute_list(name, \
- frequency_units, \
- duration_units) \
-&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
-&sysfs_ ## name ## _average_duration_ ## duration_units, \
-&sysfs_ ## name ## _max_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_ ## frequency_units,
-
#define ewma_add(ewma, val, weight, factor) \
({ \
(ewma) *= (weight) - 1; \
--
2.43.0


2024-01-26 22:10:58

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 4/5] time_stats: Promote to lib/

Library code from bcachefs for tracking latency measurements.

The main interface is
time_stats_update(stats, start_time);

which records a new event whose end time is the current time.

It features percpu buffering of input values, making it very low
overhead, and nicely formatted output to printbufs or seq_buf.

Sample output, from the bcache conversion:

root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times
count: 6414
since mount recent
duration of events
min: 440 ns
max: 1102 us
total: 674 ms
mean: 105 us 102 us
stddev: 101 us 88 us
time between events
min: 881 ns
max: 3 s
mean: 7 ms 6 ms
stddev: 52 ms 6 ms

Cc: Darrick J. Wong <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Cc: Coly Li <[email protected]>
Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/Kconfig | 2 +-
fs/bcachefs/alloc_foreground.c | 13 +-
fs/bcachefs/bcachefs.h | 11 +-
fs/bcachefs/btree_cache.c | 2 +-
fs/bcachefs/btree_gc.c | 2 +-
fs/bcachefs/btree_io.c | 8 +-
fs/bcachefs/btree_iter.c | 8 +-
fs/bcachefs/btree_locking.h | 2 +-
fs/bcachefs/btree_update_interior.c | 8 +-
fs/bcachefs/io_read.c | 4 +-
fs/bcachefs/io_write.c | 4 +-
fs/bcachefs/journal.c | 5 +-
fs/bcachefs/journal_io.c | 5 +-
fs/bcachefs/journal_reclaim.c | 9 +-
fs/bcachefs/journal_types.h | 11 +-
fs/bcachefs/nocow_locking.c | 2 +-
fs/bcachefs/super.c | 12 +-
fs/bcachefs/util.c | 262 +--------------------------
fs/bcachefs/util.h | 83 +--------
include/linux/time_stats.h | 134 ++++++++++++++
lib/Kconfig | 4 +
lib/Makefile | 2 +
lib/time_stats.c | 264 ++++++++++++++++++++++++++++
23 files changed, 455 insertions(+), 402 deletions(-)
create mode 100644 include/linux/time_stats.h
create mode 100644 lib/time_stats.c

diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 72d1179262b3..8c587ddd2f85 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -24,7 +24,7 @@ config BCACHEFS_FS
select XXHASH
select SRCU
select SYMBOLIC_ERRNAME
- select MEAN_AND_VARIANCE
+ select TIME_STATS
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b353..ca58193dd902 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);

- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);

- track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
- &c->blocked_allocate_open_bucket, false);
-
- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], false);

spin_unlock(&c->freelist_lock);
return ob;
@@ -555,8 +551,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
goto again;
}

- track_event_change(&c->times[BCH_TIME_blocked_allocate],
- &c->blocked_allocate, true);
+ track_event_change(&c->times[BCH_TIME_blocked_allocate], true);

ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b80c6c9efd8c..81070ee618bb 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -200,6 +200,7 @@
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/srcu.h>
+#include <linux/time_stats.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
@@ -593,7 +594,7 @@ struct bch_dev {

/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
- struct bch2_time_stats io_latency[2];
+ struct time_stats io_latency[2];

#define CONGESTED_MAX 1024
atomic_t congested;
@@ -640,8 +641,8 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128

struct btree_transaction_stats {
- struct bch2_time_stats duration;
- struct bch2_time_stats lock_hold_times;
+ struct time_stats duration;
+ struct time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
unsigned journal_entries_size;
@@ -919,8 +920,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
- u64 blocked_allocate;
- u64 blocked_allocate_open_bucket;

open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@@ -1104,7 +1103,7 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
bool promote_whole_extents;

- struct bch2_time_stats times[BCH_TIME_STAT_NR];
+ struct time_stats times[BCH_TIME_STAT_NR];

struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];

diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac14a..8b3c04fc406f 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -648,7 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
bch2_btree_keys_init(b);
set_btree_node_accessed(b);

- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+ time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);

memalloc_nofs_restore(flags);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643b1..774df395e4c7 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1970,7 +1970,7 @@ int bch2_gc_gens(struct bch_fs *c)

c->gc_count++;

- bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+ time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
for_each_member_device(c, ca) {
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe3226..a56dcabb7ace 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);

if (sorting_entire_node)
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);

/* Make sure we preserve bset journal_seq: */
@@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
&dst->format,
true);

- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
+ time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);

set_btree_bset_end(dst, dst->set);
@@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
+ time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1323,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work)
}
}

- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 5467a8635be1..f2d7b1dabcfb 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)

if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
time_after64(now, trans->last_begin_time + 10))
- __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+ __time_stats_update(&btree_trans_stats(trans)->duration,
trans->last_begin_time, now);

if (!trans->restarted &&
@@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
kfree(s->max_paths_text);
- bch2_time_stats_exit(&s->lock_hold_times);
+ time_stats_exit(&s->lock_hold_times);
}

if (c->btree_trans_barrier_initialized)
@@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
- bch2_time_stats_init(&s->duration);
- bch2_time_stats_init(&s->lock_hold_times);
+ time_stats_init(&s->duration);
+ time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}

diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 4bd72c855da1..f2e2c5881b7e 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+ __time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
path->l[level].lock_taken_time,
local_clock());
#endif
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 17a5938aa71a..94b5df7c3d4d 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
bch2_disk_reservation_put(c, &as->disk_res);
bch2_btree_reserve_put(as, trans);

- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+ time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
as->start_time);

mutex_lock(&c->btree_interior_update_lock);
@@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);

- bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+ time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
start_time);
}

@@ -1629,7 +1629,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,

bch2_trans_verify_locks(trans);

- bch2_time_stats_update(&c->times[n2
+ time_stats_update(&c->times[n2
? BCH_TIME_btree_node_split
: BCH_TIME_btree_node_compact],
start_time);
@@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,

bch2_btree_update_done(as, trans);

- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+ time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
if (new_path)
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..dce136cd2271 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
container_of(wop, struct promote_op, write.op);
struct bch_fs *c = op->write.op.c;

- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+ time_stats_update(&c->times[BCH_TIME_data_promote],
op->start_time);
promote_free(c, op);
}
@@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
if (rbio->start_time)
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+ time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
bio_endio(&rbio->bio);
}
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index ef3a53f9045a..a1859cc326b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)

bch2_congested_acct(ca, io_latency, now, rw);

- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+ __time_stats_update(&ca->io_latency[rw], submit_time, now);
}

#endif
@@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)

EBUG_ON(op->open_buckets.nr);

- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+ time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);

if (!(op->flags & BCH_WRITE_MOVE))
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb57..c5d6cc29be87 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -518,8 +518,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
ret = journal_entry_open(j);

if (ret == JOURNAL_ERR_max_in_flight) {
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, true);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true);
if (trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
@@ -727,7 +726,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));

if (!ret)
- bch2_time_stats_update(j->flush_seq_time, start_time);
+ time_stats_update(j->flush_seq_time, start_time);

return ret ?: ret2 < 0 ? ret2 : 0;
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index bfd6585e746d..4bd78776f0d4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1574,7 +1574,7 @@ static CLOSURE_CALLBACK(journal_write_done)
u64 v, seq;
int err = 0;

- bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+ time_stats_update(!JSET_NO_FLUSH(w->data)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);

@@ -1637,8 +1637,7 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);

- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
- &j->max_in_flight_start, false);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

closure_wake_up(&w->wait);
journal_wake(j);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 820d25e19e5f..00f7f8199301 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;

- if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
- &j->low_on_space_start, low_on_space) ||
- track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin) ||
- track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
- &j->write_buffer_full_start, low_on_wb))
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);

swap(watermark, j->watermark);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a0851..b93e02c0a178 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -274,14 +274,9 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;

- u64 low_on_space_start;
- u64 low_on_pin_start;
- u64 max_in_flight_start;
- u64 write_buffer_full_start;
-
- struct bch2_time_stats *flush_write_time;
- struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *flush_seq_time;
+ struct time_stats *flush_write_time;
+ struct time_stats *noflush_write_time;
+ struct time_stats *flush_seq_time;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 3c21981a4a1c..181efa4a83fa 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
u64 start_time = local_clock();

__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
- bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
+ time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
}
}

diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e74534096cb5..6498c8cbab1d 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -520,7 +520,7 @@ static void __bch2_fs_free(struct bch_fs *c)
unsigned i;

for (i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_exit(&c->times[i]);
+ time_stats_exit(&c->times[i]);

bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
@@ -753,7 +753,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal_keys.initial_ref_held = true;

for (i = 0; i < BCH_TIME_STAT_NR; i++)
- bch2_time_stats_init(&c->times[i]);
+ time_stats_init(&c->times[i]);

bch2_fs_copygc_init(c);
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
@@ -1168,8 +1168,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);

- bch2_time_stats_exit(&ca->io_latency[WRITE]);
- bch2_time_stats_exit(&ca->io_latency[READ]);
+ time_stats_exit(&ca->io_latency[WRITE]);
+ time_stats_exit(&ca->io_latency[READ]);

percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@@ -1260,8 +1260,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,

INIT_WORK(&ca->io_error_work, bch2_io_error_work);

- bch2_time_stats_init(&ca->io_latency[READ]);
- bch2_time_stats_init(&ca->io_latency[WRITE]);
+ time_stats_init(&ca->io_latency[READ]);
+ time_stats_init(&ca->io_latency[WRITE]);
ca->io_latency[READ].quantiles_enabled = true;
ca->io_latency[WRITE].quantiles_enabled = true;

diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index f2c8550c1331..a4c37ec66f23 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif

-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
@@ -370,119 +344,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}

-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
- unsigned i = 0;
-
- while (i < ARRAY_SIZE(q->entries)) {
- struct bch2_quantile_entry *e = q->entries + i;
-
- if (unlikely(!e->step)) {
- e->m = v;
- e->step = max_t(unsigned, v / 2, 1024);
- } else if (e->m > v) {
- e->m = e->m >= e->step
- ? e->m - e->step
- : 0;
- } else if (e->m < v) {
- e->m = e->m + e->step > e->m
- ? e->m + e->step
- : U32_MAX;
- }
-
- if ((e->m > v ? e->m - v : v - e->m) < e->step)
- e->step = max_t(unsigned, e->step / 2, 1);
-
- if (v >= e->m)
- break;
-
- i = eytzinger0_child(i, v > e->m);
- }
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
- u64 start, u64 end)
-{
- u64 duration, freq;
-
- if (time_after64(end, start)) {
- duration = end - start;
- mean_and_variance_update(&stats->duration_stats, duration);
- mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
- stats->max_duration = max(stats->max_duration, duration);
- stats->min_duration = min(stats->min_duration, duration);
- stats->total_duration += duration;
- bch2_quantiles_update(&stats->quantiles, duration);
- }
-
- if (time_after64(end, stats->last_event)) {
- freq = end - stats->last_event;
- mean_and_variance_update(&stats->freq_stats, freq);
- mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
- stats->max_freq = max(stats->max_freq, freq);
- stats->min_freq = min(stats->min_freq, freq);
- stats->last_event = end;
- }
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- for (struct bch2_time_stat_buffer_entry *i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
- b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
- struct bch2_time_stat_buffer *b)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&stats->lock, flags);
- __bch2_time_stats_clear_buffer(stats, b);
- spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
- unsigned long flags;
-
- WARN_ONCE(!stats->duration_stats_weighted.weight ||
- !stats->freq_stats_weighted.weight,
- "uninitialized time_stats");
-
- if (!stats->buffer) {
- spin_lock_irqsave(&stats->lock, flags);
- bch2_time_stats_update_one(stats, start, end);
-
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
- stats->duration_stats.n > 1024)
- stats->buffer =
- alloc_percpu_gfp(struct bch2_time_stat_buffer,
- GFP_ATOMIC);
- spin_unlock_irqrestore(&stats->lock, flags);
- } else {
- struct bch2_time_stat_buffer *b;
-
- preempt_disable();
- b = this_cpu_ptr(stats->buffer);
-
- BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
- b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
- .start = start,
- .end = end
- };
-
- if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
- bch2_time_stats_clear_buffer(stats, b);
- preempt_enable();
- }
-}
+/* time stats: printbuf output */

static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@@ -503,7 +365,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64

#define TABSTOP_SIZE 12

-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
{
s64 f_mean = 0, d_mean = 0;
u64 f_stddev = 0, d_stddev = 0;
@@ -513,7 +375,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats

spin_lock_irq(&stats->lock);
for_each_possible_cpu(cpu)
- __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
spin_unlock_irq(&stats->lock);
}

@@ -624,124 +486,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
}
}

-#include <linux/seq_buf.h>
-
-static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
-}
-
-void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats)
-{
- s64 f_mean = 0, d_mean = 0;
- u64 f_stddev = 0, d_stddev = 0;
-
- if (stats->buffer) {
- int cpu;
-
- spin_lock_irq(&stats->lock);
- for_each_possible_cpu(cpu)
- __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
- spin_unlock_irq(&stats->lock);
- }
-
- /*
- * avoid divide by zero
- */
- if (stats->freq_stats.n) {
- f_mean = mean_and_variance_get_mean(stats->freq_stats);
- f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
- d_mean = mean_and_variance_get_mean(stats->duration_stats);
- d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
- }
-
- seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
-
- seq_buf_printf(out, " since mount recent\n");
-
- seq_buf_printf(out, "duration of events\n");
-
- seq_buf_printf(out, " min: ");
- seq_buf_time_units_aligned(out, stats->min_duration);
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " max: ");
- seq_buf_time_units_aligned(out, stats->max_duration);
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " total: ");
- seq_buf_time_units_aligned(out, stats->total_duration);
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " mean: ");
- seq_buf_time_units_aligned(out, d_mean);
- seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " stddev: ");
- seq_buf_time_units_aligned(out, d_stddev);
- seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, "time between events\n");
-
- seq_buf_printf(out, " min: ");
- seq_buf_time_units_aligned(out, stats->min_freq);
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " max: ");
- seq_buf_time_units_aligned(out, stats->max_freq);
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " mean: ");
- seq_buf_time_units_aligned(out, f_mean);
- seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
- seq_buf_printf(out, "\n");
-
- seq_buf_printf(out, " stddev: ");
- seq_buf_time_units_aligned(out, f_stddev);
- seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
- seq_buf_printf(out, "\n");
-
- if (stats->quantiles_enabled) {
- int i = eytzinger0_first(NR_QUANTILES);
- const struct time_unit *u =
- pick_time_units(stats->quantiles.entries[i].m);
- u64 last_q = 0;
-
- prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
- u64 q = max(stats->quantiles.entries[i].m, last_q);
- seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
- if (is_last)
- seq_buf_printf(out, "\n");
- last_q = q;
- }
- }
-}
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
- free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
- memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.weight = 8;
- stats->freq_stats_weighted.weight = 8;
- stats->min_duration = U64_MAX;
- stats->min_freq = U64_MAX;
- spin_lock_init(&stats->lock);
-}
-
/* ratelimit: */

/**
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 7ff2d4fe26f6..cf8d16a91162 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -15,6 +15,7 @@
#include <linux/preempt.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/time_stats.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/mean_and_variance.h>
@@ -360,87 +361,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}

-#define NR_QUANTILES 15
-#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
- struct bch2_quantile_entry {
- u64 m;
- u64 step;
- } entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
- unsigned nr;
- struct bch2_time_stat_buffer_entry {
- u64 start;
- u64 end;
- } entries[32];
-};
-
-struct bch2_time_stats {
- spinlock_t lock;
- bool quantiles_enabled;
- /* all fields are in nanoseconds */
- u64 min_duration;
- u64 max_duration;
- u64 total_duration;
- u64 max_freq;
- u64 min_freq;
- u64 last_event;
- struct bch2_quantiles quantiles;
-
- struct mean_and_variance duration_stats;
- struct mean_and_variance_weighted duration_stats_weighted;
- struct mean_and_variance freq_stats;
- struct mean_and_variance_weighted freq_stats_weighted;
- struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
- __bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- if (v != !!*start) {
- if (!v) {
- bch2_time_stats_update(stats, *start);
- *start = 0;
- } else {
- *start = local_clock() ?: 1;
- return true;
- }
- }
-
- return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
- u64 *start, bool v)
-{
- bool ret = v && !*start;
- *start = v;
- return ret;
-}
-#endif
-
-void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
-
-struct seq_buf;
-void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *);
-
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);

#define ewma_add(ewma, val, weight) \
({ \
diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
new file mode 100644
index 000000000000..caefa7aba65a
--- /dev/null
+++ b/include/linux/time_stats.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ * everywhere without worrying about overhead
+ *
+ * tracks:
+ * - number of events
+ * - maximum event duration ever seen
+ * - sum of all event durations
+ * - average event duration, standard and weighted
+ * - standard deviation of event durations, standard and weighted
+ * and analogous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _LINUX_TIME_STATS_H
+#define _LINUX_TIME_STATS_H
+
+#include <linux/mean_and_variance.h>
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+
+struct time_unit {
+ const char *name;
+ u64 nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES 15
+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[32];
+};
+
+struct time_stats {
+ spinlock_t lock;
+ bool quantiles_enabled;
+ /* all fields are in nanoseconds */
+ u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
+ u64 max_freq;
+ u64 min_freq;
+ u64 last_event;
+ u64 last_event_start;
+ struct quantiles quantiles;
+
+ struct mean_and_variance duration_stats;
+ struct mean_and_variance_weighted duration_stats_weighted;
+ struct mean_and_variance freq_stats;
+ struct mean_and_variance_weighted freq_stats_weighted;
+ struct time_stat_buffer __percpu *buffer;
+};
+
+void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
+void __time_stats_update(struct time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats - time_stats to update
+ * @start - start time of event, recorded with local_clock()
+ *
+ * The end time of the event will be the current time
+ */
+static inline void time_stats_update(struct time_stats *stats, u64 start)
+{
+ __time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats - time_stats to update
+ * @v - new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct time_stats *stats, bool v)
+{
+ if (v != !!stats->last_event_start) {
+ if (!v) {
+ time_stats_update(stats, stats->last_event_start);
+ stats->last_event_start = 0;
+ } else {
+ stats->last_event_start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+struct seq_buf;
+void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *);
+
+void time_stats_exit(struct time_stats *);
+void time_stats_init(struct time_stats *);
+
+#endif /* _LINUX_TIME_STATS_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 5ddda7c2ed9b..3ba8b965f8c7 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -785,3 +785,7 @@ config POLYNOMIAL

config FIRMWARE_TABLE
bool
+
+config TIME_STATS
+ tristate
+ select MEAN_AND_VARIANCE
diff --git a/lib/Makefile b/lib/Makefile
index 6b09731d8e61..57858997c87a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -370,6 +370,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o

obj-$(CONFIG_PARMAN) += parman.o

+obj-$(CONFIG_TIME_STATS) += time_stats.o
+
obj-y += group_cpus.o

# GCC library routines
diff --git a/lib/time_stats.c b/lib/time_stats.c
new file mode 100644
index 000000000000..a5b9f149e2c6
--- /dev/null
+++ b/lib/time_stats.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/eytzinger.h>
+#include <linux/jiffies.h>
+#include <linux/percpu.h>
+#include <linux/time.h>
+#include <linux/time_stats.h>
+#include <linux/spinlock.h>
+
+static const struct time_unit time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static inline void time_stats_update_one(struct time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+
+ if (time_after64(end, start)) {
+ duration = end - start;
+ mean_and_variance_update(&stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
+ stats->max_duration = max(stats->max_duration, duration);
+ stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
+
+ if (stats->quantiles_enabled)
+ quantiles_update(&stats->quantiles, duration);
+ }
+
+ if (time_after64(end, stats->last_event)) {
+ freq = end - stats->last_event;
+ mean_and_variance_update(&stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
+ stats->max_freq = max(stats->max_freq, freq);
+ stats->min_freq = min(stats->min_freq, freq);
+ stats->last_event = end;
+ }
+}
+
+void __time_stats_clear_buffer(struct time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ for (struct time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
+
+static noinline void time_stats_clear_buffer(struct time_stats *stats,
+ struct time_stat_buffer *b)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stats->lock, flags);
+ __time_stats_clear_buffer(stats, b);
+ spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
+{
+ unsigned long flags;
+
+ WARN_ONCE(!stats->duration_stats_weighted.weight ||
+ !stats->freq_stats_weighted.weight,
+ "uninitialized time_stats");
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
+ stats->duration_stats.n > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+ time_stats_clear_buffer(stats, b);
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL_GPL(__time_stats_update);
+
+#include <linux/seq_buf.h>
+
+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
+}
+
+void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats)
+{
+ s64 f_mean = 0, d_mean = 0;
+ u64 f_stddev = 0, d_stddev = 0;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
+ /*
+ * avoid divide by zero
+ */
+ if (stats->freq_stats.n) {
+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
+ }
+
+ seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
+
+ seq_buf_printf(out, " since mount recent\n");
+
+ seq_buf_printf(out, "duration of events\n");
+
+ seq_buf_printf(out, " min: ");
+ seq_buf_time_units_aligned(out, stats->min_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " max: ");
+ seq_buf_time_units_aligned(out, stats->max_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " total: ");
+ seq_buf_time_units_aligned(out, stats->total_duration);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " mean: ");
+ seq_buf_time_units_aligned(out, d_mean);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " stddev: ");
+ seq_buf_time_units_aligned(out, d_stddev);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, "time between events\n");
+
+ seq_buf_printf(out, " min: ");
+ seq_buf_time_units_aligned(out, stats->min_freq);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " max: ");
+ seq_buf_time_units_aligned(out, stats->max_freq);
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " mean: ");
+ seq_buf_time_units_aligned(out, f_mean);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ seq_buf_printf(out, " stddev: ");
+ seq_buf_time_units_aligned(out, f_stddev);
+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+ seq_buf_printf(out, "\n");
+
+ if (stats->quantiles_enabled) {
+ int i = eytzinger0_first(NR_QUANTILES);
+ const struct time_unit *u =
+ pick_time_units(stats->quantiles.entries[i].m);
+ u64 last_q = 0;
+
+ seq_buf_printf(out, "quantiles (%s):\t", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+ u64 q = max(stats->quantiles.entries[i].m, last_q);
+ seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
+ if (is_last)
+ seq_buf_printf(out, "\n");
+ last_q = q;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
+
+void time_stats_exit(struct time_stats *stats)
+{
+ free_percpu(stats->buffer);
+}
+EXPORT_SYMBOL_GPL(time_stats_exit);
+
+void time_stats_init(struct time_stats *stats)
+{
+ memset(stats, 0, sizeof(*stats));
+ stats->duration_stats_weighted.weight = 8;
+ stats->freq_stats_weighted.weight = 8;
+ stats->min_duration = U64_MAX;
+ stats->min_freq = U64_MAX;
+ spin_lock_init(&stats->lock);
+}
+EXPORT_SYMBOL_GPL(time_stats_init);
--
2.43.0


2024-01-27 01:56:52

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 4/5] time_stats: Promote to lib/

On Fri, Jan 26, 2024 at 05:06:54PM -0500, Kent Overstreet wrote:
> Library code from bcachefs for tracking latency measurements.
>
> The main interface is
> time_stats_update(stats, start_time);
>
> which collects a new event with an end time of the current time.
>
> It features percpu buffering of input values, making it very low
> overhead, and nicely formatted output to printbufs or seq_buf.
>
> Sample output, from the bcache conversion:
>
> root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times
> count: 6414
> since mount recent
> duration of events
> min: 440 ns
> max: 1102 us
> total: 674 ms
> mean: 105 us 102 us
> stddev: 101 us 88 us
> time between events
> min: 881 ns
> max: 3 s
> mean: 7 ms 6 ms
> stddev: 52 ms 6 ms
>
> Cc: Darrick J. Wong <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Theodore Ts'o <[email protected]>
> Cc: Coly Li <[email protected]>
> Signed-off-by: Kent Overstreet <[email protected]>

Hmm. Ok, so, if I wanted to report timestats information for, say,
xfs_log_reserve, I would do ... what?

Embed a struct time_stats into struct xlog;

Call time_stats_init on it when we create the xlog object (and
time_stats_exit when we're tearing down the log)

Then, in xfs_log_reserve, we'd do something like:

u64 start_time;

...

trace_xfs_log_reserve(log, tic);
start_time = local_clock(); /* new */

error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
&need_bytes);
if (error)
goto out_error;

xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
trace_xfs_log_reserve_exit(log, tic);
time_stats_update(&log->l_grant_stall, start_time); /* new */
xlog_verify_grant_tail(log);

And then we create a debugfs file somewhere and make its
xlog_grant_head_stall_read function do something along the lines of:

char buf[PAGE_SIZE];
struct xlog *xlog = file->private_data;

seq_buf_init(&s, buf, sizeof(buf));
time_stats_to_seq_buf(&s, &xlog->l_grant_stall);

return simple_read_from_buffer(ubuf, count, ppos, buf,
seq_buf_used(&s));

This will get us a report of the number of times we've added to the
reserve and write heads; various stats about how much time we've spent
waiting for the grants, and how much time goes by between the two?

The mean time between events might be much more useful for tracking time
between online repair events though.

Or, we could put a couple of these into struct xfs_perag for the AGF and
AGI buffers and use track_event_change to keep track of the time spent
with the AGF/AGI locked vs. unlocked. Maybe it would be more useful to
track the time that the AG[FI] buffer locks are contended, vs. merely
held by someone in the system.

If my understanding of all this is correct, then this looks promising
for augmenting pcp a bit;
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/bcachefs/Kconfig | 2 +-
> fs/bcachefs/alloc_foreground.c | 13 +-
> fs/bcachefs/bcachefs.h | 11 +-
> fs/bcachefs/btree_cache.c | 2 +-
> fs/bcachefs/btree_gc.c | 2 +-
> fs/bcachefs/btree_io.c | 8 +-
> fs/bcachefs/btree_iter.c | 8 +-
> fs/bcachefs/btree_locking.h | 2 +-
> fs/bcachefs/btree_update_interior.c | 8 +-
> fs/bcachefs/io_read.c | 4 +-
> fs/bcachefs/io_write.c | 4 +-
> fs/bcachefs/journal.c | 5 +-
> fs/bcachefs/journal_io.c | 5 +-
> fs/bcachefs/journal_reclaim.c | 9 +-
> fs/bcachefs/journal_types.h | 11 +-
> fs/bcachefs/nocow_locking.c | 2 +-
> fs/bcachefs/super.c | 12 +-
> fs/bcachefs/util.c | 262 +--------------------------
> fs/bcachefs/util.h | 83 +--------
> include/linux/time_stats.h | 134 ++++++++++++++
> lib/Kconfig | 4 +
> lib/Makefile | 2 +
> lib/time_stats.c | 264 ++++++++++++++++++++++++++++
> 23 files changed, 455 insertions(+), 402 deletions(-)
> create mode 100644 include/linux/time_stats.h
> create mode 100644 lib/time_stats.c
>
> diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
> index 72d1179262b3..8c587ddd2f85 100644
> --- a/fs/bcachefs/Kconfig
> +++ b/fs/bcachefs/Kconfig
> @@ -24,7 +24,7 @@ config BCACHEFS_FS
> select XXHASH
> select SRCU
> select SYMBOLIC_ERRNAME
> - select MEAN_AND_VARIANCE
> + select TIME_STATS
> help
> The bcachefs filesystem - a modern, copy on write filesystem, with
> support for multiple devices, compression, checksumming, etc.
> diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
> index 633d3223b353..ca58193dd902 100644
> --- a/fs/bcachefs/alloc_foreground.c
> +++ b/fs/bcachefs/alloc_foreground.c
> @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
> if (cl)
> closure_wait(&c->open_buckets_wait, cl);
>
> - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
> - &c->blocked_allocate_open_bucket, true);
> + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
> spin_unlock(&c->freelist_lock);
> return ERR_PTR(-BCH_ERR_open_buckets_empty);
> }
> @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
> ca->nr_open_buckets++;
> bch2_open_bucket_hash_add(c, ob);
>
> - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
> - &c->blocked_allocate_open_bucket, false);
> -
> - track_event_change(&c->times[BCH_TIME_blocked_allocate],
> - &c->blocked_allocate, false);
> + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
> + track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
>
> spin_unlock(&c->freelist_lock);
> return ob;
> @@ -555,8 +551,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
> goto again;
> }
>
> - track_event_change(&c->times[BCH_TIME_blocked_allocate],
> - &c->blocked_allocate, true);
> + track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
>
> ob = ERR_PTR(-BCH_ERR_freelist_empty);
> goto err;
> diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
> index b80c6c9efd8c..81070ee618bb 100644
> --- a/fs/bcachefs/bcachefs.h
> +++ b/fs/bcachefs/bcachefs.h
> @@ -200,6 +200,7 @@
> #include <linux/seqlock.h>
> #include <linux/shrinker.h>
> #include <linux/srcu.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
> #include <linux/workqueue.h>
> #include <linux/zstd.h>
> @@ -593,7 +594,7 @@ struct bch_dev {
>
> /* The rest of this all shows up in sysfs */
> atomic64_t cur_latency[2];
> - struct bch2_time_stats io_latency[2];
> + struct time_stats io_latency[2];
>
> #define CONGESTED_MAX 1024
> atomic_t congested;
> @@ -640,8 +641,8 @@ struct btree_debug {
> #define BCH_TRANSACTIONS_NR 128
>
> struct btree_transaction_stats {
> - struct bch2_time_stats duration;
> - struct bch2_time_stats lock_hold_times;
> + struct time_stats duration;
> + struct time_stats lock_hold_times;
> struct mutex lock;
> unsigned nr_max_paths;
> unsigned journal_entries_size;
> @@ -919,8 +920,6 @@ struct bch_fs {
> /* ALLOCATOR */
> spinlock_t freelist_lock;
> struct closure_waitlist freelist_wait;
> - u64 blocked_allocate;
> - u64 blocked_allocate_open_bucket;
>
> open_bucket_idx_t open_buckets_freelist;
> open_bucket_idx_t open_buckets_nr_free;
> @@ -1104,7 +1103,7 @@ struct bch_fs {
> unsigned copy_gc_enabled:1;
> bool promote_whole_extents;
>
> - struct bch2_time_stats times[BCH_TIME_STAT_NR];
> + struct time_stats times[BCH_TIME_STAT_NR];
>
> struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
>
> diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
> index d7c81beac14a..8b3c04fc406f 100644
> --- a/fs/bcachefs/btree_cache.c
> +++ b/fs/bcachefs/btree_cache.c
> @@ -648,7 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
> bch2_btree_keys_init(b);
> set_btree_node_accessed(b);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
> + time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
> start_time);
>
> memalloc_nofs_restore(flags);
> diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
> index 1102995643b1..774df395e4c7 100644
> --- a/fs/bcachefs/btree_gc.c
> +++ b/fs/bcachefs/btree_gc.c
> @@ -1970,7 +1970,7 @@ int bch2_gc_gens(struct bch_fs *c)
>
> c->gc_count++;
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
> + time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
> trace_and_count(c, gc_gens_end, c);
> err:
> for_each_member_device(c, ca) {
> diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
> index aa9b6cbe3226..a56dcabb7ace 100644
> --- a/fs/bcachefs/btree_io.c
> +++ b/fs/bcachefs/btree_io.c
> @@ -327,7 +327,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
> BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
>
> if (sorting_entire_node)
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
> + time_stats_update(&c->times[BCH_TIME_btree_node_sort],
> start_time);
>
> /* Make sure we preserve bset journal_seq: */
> @@ -397,7 +397,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
> &dst->format,
> true);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
> + time_stats_update(&c->times[BCH_TIME_btree_node_sort],
> start_time);
>
> set_btree_bset_end(dst, dst->set);
> @@ -1251,7 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
> out:
> mempool_free(iter, &c->fill_iter);
> printbuf_exit(&buf);
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
> + time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
> return retry_read;
> fsck_err:
> if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
> @@ -1323,7 +1323,7 @@ static void btree_node_read_work(struct work_struct *work)
> }
> }
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
> + time_stats_update(&c->times[BCH_TIME_btree_node_read],
> rb->start_time);
> bio_put(&rb->bio);
>
> diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
> index 5467a8635be1..f2d7b1dabcfb 100644
> --- a/fs/bcachefs/btree_iter.c
> +++ b/fs/bcachefs/btree_iter.c
> @@ -2899,7 +2899,7 @@ u32 bch2_trans_begin(struct btree_trans *trans)
>
> if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
> time_after64(now, trans->last_begin_time + 10))
> - __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
> + __time_stats_update(&btree_trans_stats(trans)->duration,
> trans->last_begin_time, now);
>
> if (!trans->restarted &&
> @@ -3224,7 +3224,7 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
> s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
> s++) {
> kfree(s->max_paths_text);
> - bch2_time_stats_exit(&s->lock_hold_times);
> + time_stats_exit(&s->lock_hold_times);
> }
>
> if (c->btree_trans_barrier_initialized)
> @@ -3240,8 +3240,8 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
> for (s = c->btree_transaction_stats;
> s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
> s++) {
> - bch2_time_stats_init(&s->duration);
> - bch2_time_stats_init(&s->lock_hold_times);
> + time_stats_init(&s->duration);
> + time_stats_init(&s->lock_hold_times);
> mutex_init(&s->lock);
> }
>
> diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
> index 4bd72c855da1..f2e2c5881b7e 100644
> --- a/fs/bcachefs/btree_locking.h
> +++ b/fs/bcachefs/btree_locking.h
> @@ -122,7 +122,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
> struct btree_path *path, unsigned level)
> {
> #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
> - __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
> + __time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
> path->l[level].lock_taken_time,
> local_clock());
> #endif
> diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
> index 17a5938aa71a..94b5df7c3d4d 100644
> --- a/fs/bcachefs/btree_update_interior.c
> +++ b/fs/bcachefs/btree_update_interior.c
> @@ -516,7 +516,7 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
> bch2_disk_reservation_put(c, &as->disk_res);
> bch2_btree_reserve_put(as, trans);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
> + time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
> as->start_time);
>
> mutex_lock(&c->btree_interior_update_lock);
> @@ -1038,7 +1038,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
> continue_at(&as->cl, btree_update_set_nodes_written,
> as->c->btree_interior_update_worker);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
> + time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
> start_time);
> }
>
> @@ -1629,7 +1629,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
>
> bch2_trans_verify_locks(trans);
>
> - bch2_time_stats_update(&c->times[n2
> + time_stats_update(&c->times[n2
> ? BCH_TIME_btree_node_split
> : BCH_TIME_btree_node_compact],
> start_time);
> @@ -1935,7 +1935,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
>
> bch2_btree_update_done(as, trans);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
> + time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
> out:
> err:
> if (new_path)
> diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
> index 3c574d8873a1..dce136cd2271 100644
> --- a/fs/bcachefs/io_read.c
> +++ b/fs/bcachefs/io_read.c
> @@ -134,7 +134,7 @@ static void promote_done(struct bch_write_op *wop)
> container_of(wop, struct promote_op, write.op);
> struct bch_fs *c = op->write.op.c;
>
> - bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
> + time_stats_update(&c->times[BCH_TIME_data_promote],
> op->start_time);
> promote_free(c, op);
> }
> @@ -356,7 +356,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
> static void bch2_rbio_done(struct bch_read_bio *rbio)
> {
> if (rbio->start_time)
> - bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
> + time_stats_update(&rbio->c->times[BCH_TIME_data_read],
> rbio->start_time);
> bio_endio(&rbio->bio);
> }
> diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
> index ef3a53f9045a..a1859cc326b3 100644
> --- a/fs/bcachefs/io_write.c
> +++ b/fs/bcachefs/io_write.c
> @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
>
> bch2_congested_acct(ca, io_latency, now, rw);
>
> - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
> + __time_stats_update(&ca->io_latency[rw], submit_time, now);
> }
>
> #endif
> @@ -457,7 +457,7 @@ static void bch2_write_done(struct closure *cl)
>
> EBUG_ON(op->open_buckets.nr);
>
> - bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
> + time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
> bch2_disk_reservation_put(c, &op->res);
>
> if (!(op->flags & BCH_WRITE_MOVE))
> diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
> index bc890776eb57..c5d6cc29be87 100644
> --- a/fs/bcachefs/journal.c
> +++ b/fs/bcachefs/journal.c
> @@ -518,8 +518,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
> ret = journal_entry_open(j);
>
> if (ret == JOURNAL_ERR_max_in_flight) {
> - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
> - &j->max_in_flight_start, true);
> + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true);
> if (trace_journal_entry_full_enabled()) {
> struct printbuf buf = PRINTBUF;
> buf.atomic++;
> @@ -727,7 +726,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
> ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
>
> if (!ret)
> - bch2_time_stats_update(j->flush_seq_time, start_time);
> + time_stats_update(j->flush_seq_time, start_time);
>
> return ret ?: ret2 < 0 ? ret2 : 0;
> }
> diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
> index bfd6585e746d..4bd78776f0d4 100644
> --- a/fs/bcachefs/journal_io.c
> +++ b/fs/bcachefs/journal_io.c
> @@ -1574,7 +1574,7 @@ static CLOSURE_CALLBACK(journal_write_done)
> u64 v, seq;
> int err = 0;
>
> - bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
> + time_stats_update(!JSET_NO_FLUSH(w->data)
> ? j->flush_write_time
> : j->noflush_write_time, j->write_start_time);
>
> @@ -1637,8 +1637,7 @@ static CLOSURE_CALLBACK(journal_write_done)
> bch2_journal_reclaim_fast(j);
> bch2_journal_space_available(j);
>
> - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
> - &j->max_in_flight_start, false);
> + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
>
> closure_wake_up(&w->wait);
> journal_wake(j);
> diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
> index 820d25e19e5f..00f7f8199301 100644
> --- a/fs/bcachefs/journal_reclaim.c
> +++ b/fs/bcachefs/journal_reclaim.c
> @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
> ? BCH_WATERMARK_reclaim
> : BCH_WATERMARK_stripe;
>
> - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
> - &j->low_on_space_start, low_on_space) ||
> - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
> - &j->low_on_pin_start, low_on_pin) ||
> - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
> - &j->write_buffer_full_start, low_on_wb))
> + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
> + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
> + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
> trace_and_count(c, journal_full, c);
>
> swap(watermark, j->watermark);
> diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
> index 38817c7a0851..b93e02c0a178 100644
> --- a/fs/bcachefs/journal_types.h
> +++ b/fs/bcachefs/journal_types.h
> @@ -274,14 +274,9 @@ struct journal {
> u64 nr_noflush_writes;
> u64 entry_bytes_written;
>
> - u64 low_on_space_start;
> - u64 low_on_pin_start;
> - u64 max_in_flight_start;
> - u64 write_buffer_full_start;
> -
> - struct bch2_time_stats *flush_write_time;
> - struct bch2_time_stats *noflush_write_time;
> - struct bch2_time_stats *flush_seq_time;
> + struct time_stats *flush_write_time;
> + struct time_stats *noflush_write_time;
> + struct time_stats *flush_seq_time;
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map res_map;
> diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
> index 3c21981a4a1c..181efa4a83fa 100644
> --- a/fs/bcachefs/nocow_locking.c
> +++ b/fs/bcachefs/nocow_locking.c
> @@ -85,7 +85,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
> u64 start_time = local_clock();
>
> __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
> - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
> + time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
> }
> }
>
> diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
> index e74534096cb5..6498c8cbab1d 100644
> --- a/fs/bcachefs/super.c
> +++ b/fs/bcachefs/super.c
> @@ -520,7 +520,7 @@ static void __bch2_fs_free(struct bch_fs *c)
> unsigned i;
>
> for (i = 0; i < BCH_TIME_STAT_NR; i++)
> - bch2_time_stats_exit(&c->times[i]);
> + time_stats_exit(&c->times[i]);
>
> bch2_free_pending_node_rewrites(c);
> bch2_fs_sb_errors_exit(c);
> @@ -753,7 +753,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
> c->journal_keys.initial_ref_held = true;
>
> for (i = 0; i < BCH_TIME_STAT_NR; i++)
> - bch2_time_stats_init(&c->times[i]);
> + time_stats_init(&c->times[i]);
>
> bch2_fs_copygc_init(c);
> bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
> @@ -1168,8 +1168,8 @@ static void bch2_dev_free(struct bch_dev *ca)
> bch2_dev_buckets_free(ca);
> free_page((unsigned long) ca->sb_read_scratch);
>
> - bch2_time_stats_exit(&ca->io_latency[WRITE]);
> - bch2_time_stats_exit(&ca->io_latency[READ]);
> + time_stats_exit(&ca->io_latency[WRITE]);
> + time_stats_exit(&ca->io_latency[READ]);
>
> percpu_ref_exit(&ca->io_ref);
> percpu_ref_exit(&ca->ref);
> @@ -1260,8 +1260,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
>
> INIT_WORK(&ca->io_error_work, bch2_io_error_work);
>
> - bch2_time_stats_init(&ca->io_latency[READ]);
> - bch2_time_stats_init(&ca->io_latency[WRITE]);
> + time_stats_init(&ca->io_latency[READ]);
> + time_stats_init(&ca->io_latency[WRITE]);
> ca->io_latency[READ].quantiles_enabled = true;
> ca->io_latency[WRITE].quantiles_enabled = true;
>
> diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> index f2c8550c1331..a4c37ec66f23 100644
> --- a/fs/bcachefs/util.c
> +++ b/fs/bcachefs/util.c
> @@ -337,32 +337,6 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
> }
> #endif
>
> -static const struct time_unit {
> - const char *name;
> - u64 nsecs;
> -} time_units[] = {
> - { "ns", 1 },
> - { "us", NSEC_PER_USEC },
> - { "ms", NSEC_PER_MSEC },
> - { "s", NSEC_PER_SEC },
> - { "m", (u64) NSEC_PER_SEC * 60},
> - { "h", (u64) NSEC_PER_SEC * 3600},
> - { "eon", U64_MAX },
> -};
> -
> -static const struct time_unit *pick_time_units(u64 ns)
> -{
> - const struct time_unit *u;
> -
> - for (u = time_units;
> - u + 1 < time_units + ARRAY_SIZE(time_units) &&
> - ns >= u[1].nsecs << 1;
> - u++)
> - ;
> -
> - return u;
> -}
> -
> void bch2_pr_time_units(struct printbuf *out, u64 ns)
> {
> const struct time_unit *u = pick_time_units(ns);
> @@ -370,119 +344,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
> prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
> }
>
> -/* time stats: */
> -
> -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
> -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
> -{
> - unsigned i = 0;
> -
> - while (i < ARRAY_SIZE(q->entries)) {
> - struct bch2_quantile_entry *e = q->entries + i;
> -
> - if (unlikely(!e->step)) {
> - e->m = v;
> - e->step = max_t(unsigned, v / 2, 1024);
> - } else if (e->m > v) {
> - e->m = e->m >= e->step
> - ? e->m - e->step
> - : 0;
> - } else if (e->m < v) {
> - e->m = e->m + e->step > e->m
> - ? e->m + e->step
> - : U32_MAX;
> - }
> -
> - if ((e->m > v ? e->m - v : v - e->m) < e->step)
> - e->step = max_t(unsigned, e->step / 2, 1);
> -
> - if (v >= e->m)
> - break;
> -
> - i = eytzinger0_child(i, v > e->m);
> - }
> -}
> -
> -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
> - u64 start, u64 end)
> -{
> - u64 duration, freq;
> -
> - if (time_after64(end, start)) {
> - duration = end - start;
> - mean_and_variance_update(&stats->duration_stats, duration);
> - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
> - stats->max_duration = max(stats->max_duration, duration);
> - stats->min_duration = min(stats->min_duration, duration);
> - stats->total_duration += duration;
> - bch2_quantiles_update(&stats->quantiles, duration);
> - }
> -
> - if (time_after64(end, stats->last_event)) {
> - freq = end - stats->last_event;
> - mean_and_variance_update(&stats->freq_stats, freq);
> - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
> - stats->max_freq = max(stats->max_freq, freq);
> - stats->min_freq = min(stats->min_freq, freq);
> - stats->last_event = end;
> - }
> -}
> -
> -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
> - struct bch2_time_stat_buffer *b)
> -{
> - for (struct bch2_time_stat_buffer_entry *i = b->entries;
> - i < b->entries + ARRAY_SIZE(b->entries);
> - i++)
> - bch2_time_stats_update_one(stats, i->start, i->end);
> - b->nr = 0;
> -}
> -
> -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
> - struct bch2_time_stat_buffer *b)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&stats->lock, flags);
> - __bch2_time_stats_clear_buffer(stats, b);
> - spin_unlock_irqrestore(&stats->lock, flags);
> -}
> -
> -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
> -{
> - unsigned long flags;
> -
> - WARN_ONCE(!stats->duration_stats_weighted.weight ||
> - !stats->freq_stats_weighted.weight,
> - "uninitialized time_stats");
> -
> - if (!stats->buffer) {
> - spin_lock_irqsave(&stats->lock, flags);
> - bch2_time_stats_update_one(stats, start, end);
> -
> - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
> - stats->duration_stats.n > 1024)
> - stats->buffer =
> - alloc_percpu_gfp(struct bch2_time_stat_buffer,
> - GFP_ATOMIC);
> - spin_unlock_irqrestore(&stats->lock, flags);
> - } else {
> - struct bch2_time_stat_buffer *b;
> -
> - preempt_disable();
> - b = this_cpu_ptr(stats->buffer);
> -
> - BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
> - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
> - .start = start,
> - .end = end
> - };
> -
> - if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
> - bch2_time_stats_clear_buffer(stats, b);
> - preempt_enable();
> - }
> -}
> +/* time stats: printbuf output */
>
> static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
> {
> @@ -503,7 +365,7 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
>
> #define TABSTOP_SIZE 12
>
> -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
> +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
> {
> s64 f_mean = 0, d_mean = 0;
> u64 f_stddev = 0, d_stddev = 0;
> @@ -513,7 +375,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
>
> spin_lock_irq(&stats->lock);
> for_each_possible_cpu(cpu)
> - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> spin_unlock_irq(&stats->lock);
> }
>
> @@ -624,124 +486,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
> }
> }
>
> -#include <linux/seq_buf.h>
> -
> -static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
> -{
> - const struct time_unit *u = pick_time_units(ns);
> -
> - seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
> -}
> -
> -void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats)
> -{
> - s64 f_mean = 0, d_mean = 0;
> - u64 f_stddev = 0, d_stddev = 0;
> -
> - if (stats->buffer) {
> - int cpu;
> -
> - spin_lock_irq(&stats->lock);
> - for_each_possible_cpu(cpu)
> - __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> - spin_unlock_irq(&stats->lock);
> - }
> -
> - /*
> - * avoid divide by zero
> - */
> - if (stats->freq_stats.n) {
> - f_mean = mean_and_variance_get_mean(stats->freq_stats);
> - f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
> - d_mean = mean_and_variance_get_mean(stats->duration_stats);
> - d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
> - }
> -
> - seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
> -
> - seq_buf_printf(out, " since mount recent\n");
> -
> - seq_buf_printf(out, "duration of events\n");
> -
> - seq_buf_printf(out, " min: ");
> - seq_buf_time_units_aligned(out, stats->min_duration);
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " max: ");
> - seq_buf_time_units_aligned(out, stats->max_duration);
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " total: ");
> - seq_buf_time_units_aligned(out, stats->total_duration);
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " mean: ");
> - seq_buf_time_units_aligned(out, d_mean);
> - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " stddev: ");
> - seq_buf_time_units_aligned(out, d_stddev);
> - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, "time between events\n");
> -
> - seq_buf_printf(out, " min: ");
> - seq_buf_time_units_aligned(out, stats->min_freq);
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " max: ");
> - seq_buf_time_units_aligned(out, stats->max_freq);
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " mean: ");
> - seq_buf_time_units_aligned(out, f_mean);
> - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
> - seq_buf_printf(out, "\n");
> -
> - seq_buf_printf(out, " stddev: ");
> - seq_buf_time_units_aligned(out, f_stddev);
> - seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
> - seq_buf_printf(out, "\n");
> -
> - if (stats->quantiles_enabled) {
> - int i = eytzinger0_first(NR_QUANTILES);
> - const struct time_unit *u =
> - pick_time_units(stats->quantiles.entries[i].m);
> - u64 last_q = 0;
> -
> - prt_printf(out, "quantiles (%s):\t", u->name);
> - eytzinger0_for_each(i, NR_QUANTILES) {
> - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> -
> - u64 q = max(stats->quantiles.entries[i].m, last_q);
> - seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
> - if (is_last)
> - seq_buf_printf(out, "\n");
> - last_q = q;
> - }
> - }
> -}
> -#else
> -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
> -#endif
> -
> -void bch2_time_stats_exit(struct bch2_time_stats *stats)
> -{
> - free_percpu(stats->buffer);
> -}
> -
> -void bch2_time_stats_init(struct bch2_time_stats *stats)
> -{
> - memset(stats, 0, sizeof(*stats));
> - stats->duration_stats_weighted.weight = 8;
> - stats->freq_stats_weighted.weight = 8;
> - stats->min_duration = U64_MAX;
> - stats->min_freq = U64_MAX;
> - spin_lock_init(&stats->lock);
> -}
> -
> /* ratelimit: */
>
> /**
> diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> index 7ff2d4fe26f6..cf8d16a91162 100644
> --- a/fs/bcachefs/util.h
> +++ b/fs/bcachefs/util.h
> @@ -15,6 +15,7 @@
> #include <linux/preempt.h>
> #include <linux/ratelimit.h>
> #include <linux/slab.h>
> +#include <linux/time_stats.h>
> #include <linux/vmalloc.h>
> #include <linux/workqueue.h>
> #include <linux/mean_and_variance.h>
> @@ -360,87 +361,7 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
> #endif
> }
>
> -#define NR_QUANTILES 15
> -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
> -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
> -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
> -
> -struct bch2_quantiles {
> - struct bch2_quantile_entry {
> - u64 m;
> - u64 step;
> - } entries[NR_QUANTILES];
> -};
> -
> -struct bch2_time_stat_buffer {
> - unsigned nr;
> - struct bch2_time_stat_buffer_entry {
> - u64 start;
> - u64 end;
> - } entries[32];
> -};
> -
> -struct bch2_time_stats {
> - spinlock_t lock;
> - bool quantiles_enabled;
> - /* all fields are in nanoseconds */
> - u64 min_duration;
> - u64 max_duration;
> - u64 total_duration;
> - u64 max_freq;
> - u64 min_freq;
> - u64 last_event;
> - struct bch2_quantiles quantiles;
> -
> - struct mean_and_variance duration_stats;
> - struct mean_and_variance_weighted duration_stats_weighted;
> - struct mean_and_variance freq_stats;
> - struct mean_and_variance_weighted freq_stats_weighted;
> - struct bch2_time_stat_buffer __percpu *buffer;
> -};
> -
> -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
> -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
> -
> -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
> -{
> - __bch2_time_stats_update(stats, start, local_clock());
> -}
> -
> -static inline bool track_event_change(struct bch2_time_stats *stats,
> - u64 *start, bool v)
> -{
> - if (v != !!*start) {
> - if (!v) {
> - bch2_time_stats_update(stats, *start);
> - *start = 0;
> - } else {
> - *start = local_clock() ?: 1;
> - return true;
> - }
> - }
> -
> - return false;
> -}
> -#else
> -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
> -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
> -static inline bool track_event_change(struct bch2_time_stats *stats,
> - u64 *start, bool v)
> -{
> - bool ret = v && !*start;
> - *start = v;
> - return ret;
> -}
> -#endif
> -
> -void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
> -
> -struct seq_buf;
> -void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *);
> -
> -void bch2_time_stats_exit(struct bch2_time_stats *);
> -void bch2_time_stats_init(struct bch2_time_stats *);
> +void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
>
> #define ewma_add(ewma, val, weight) \
> ({ \
> diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
> new file mode 100644
> index 000000000000..caefa7aba65a
> --- /dev/null
> +++ b/include/linux/time_stats.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * time_stats - collect statistics on events that have a duration, with nicely
> + * formatted textual output on demand
> + *
> + * - percpu buffering of event collection: cheap enough to shotgun
> + * everywhere without worrying about overhead
> + *
> + * tracks:
> + * - number of events
> + * - maximum event duration ever seen
> + * - sum of all event durations
> + * - average event duration, standard and weighted
> + * - standard deviation of event durations, standard and weighted
> + * and analogous statistics for the frequency of events
> + *
> + * We provide both mean and weighted mean (exponentially weighted), and standard
> + * deviation and weighted standard deviation, to give an efficient-to-compute
> + * view of current behaviour versus average behaviour - "did this event source
> + * just become wonky, or is this typical?".
> + *
> + * Particularly useful for tracking down latency issues.
> + */
> +#ifndef _LINUX_TIME_STATS_H
> +#define _LINUX_TIME_STATS_H
> +
> +#include <linux/mean_and_variance.h>
> +#include <linux/sched/clock.h>
> +#include <linux/spinlock_types.h>
> +
> +struct time_unit {
> + const char *name;
> + u64 nsecs;
> +};
> +
> +/*
> + * given a nanosecond value, pick the preferred time units for printing:
> + */
> +const struct time_unit *pick_time_units(u64 ns);
> +
> +/*
> + * quantiles - do not use:
> + *
> + * Only enabled if time_stats->quantiles_enabled has been manually set - don't
> + * use in new code.
> + */
> +
> +#define NR_QUANTILES 15
> +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
> +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
> +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
> +
> +struct quantiles {
> + struct quantile_entry {
> + u64 m;
> + u64 step;
> + } entries[NR_QUANTILES];
> +};
> +
> +struct time_stat_buffer {
> + unsigned nr;
> + struct time_stat_buffer_entry {
> + u64 start;
> + u64 end;
> + } entries[32];
> +};
> +
> +struct time_stats {
> + spinlock_t lock;
> + bool quantiles_enabled;
> + /* all fields are in nanoseconds */
> + u64 min_duration;
> + u64 max_duration;
> + u64 total_duration;
> + u64 max_freq;
> + u64 min_freq;
> + u64 last_event;
> + u64 last_event_start;
> + struct quantiles quantiles;
> +
> + struct mean_and_variance duration_stats;
> + struct mean_and_variance_weighted duration_stats_weighted;
> + struct mean_and_variance freq_stats;
> + struct mean_and_variance_weighted freq_stats_weighted;
> + struct time_stat_buffer __percpu *buffer;
> +};
> +
> +void __time_stats_clear_buffer(struct time_stats *, struct time_stat_buffer *);
> +void __time_stats_update(struct time_stats *stats, u64, u64);
> +
> +/**
> + * time_stats_update - collect a new event being tracked
> + *
> + * @stats - time_stats to update
> + * @start - start time of event, recorded with local_clock()
> + *
> + * The end duration of the event will be the current time
> + */
> +static inline void time_stats_update(struct time_stats *stats, u64 start)
> +{
> + __time_stats_update(stats, start, local_clock());
> +}
> +
> +/**
> + * track_event_change - track state change events
> + *
> + * @stats - time_stats to update
> + * @v - new state, true or false
> + *
> + * Use this when tracking time stats for state changes, i.e. resource X becoming
> + * blocked/unblocked.
> + */
> +static inline bool track_event_change(struct time_stats *stats, bool v)
> +{
> + if (v != !!stats->last_event_start) {
> + if (!v) {
> + time_stats_update(stats, stats->last_event_start);
> + stats->last_event_start = 0;
> + } else {
> + stats->last_event_start = local_clock() ?: 1;
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +
> +struct seq_buf;
> +void time_stats_to_seq_buf(struct seq_buf *, struct time_stats *);
> +
> +void time_stats_exit(struct time_stats *);
> +void time_stats_init(struct time_stats *);
> +
> +#endif /* _LINUX_TIME_STATS_H */
> diff --git a/lib/Kconfig b/lib/Kconfig
> index 5ddda7c2ed9b..3ba8b965f8c7 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -785,3 +785,7 @@ config POLYNOMIAL
>
> config FIRMWARE_TABLE
> bool
> +
> +config TIME_STATS
> + tristate
> + select MEAN_AND_VARIANCE
> diff --git a/lib/Makefile b/lib/Makefile
> index 6b09731d8e61..57858997c87a 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -370,6 +370,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o
>
> obj-$(CONFIG_PARMAN) += parman.o
>
> +obj-$(CONFIG_TIME_STATS) += time_stats.o
> +
> obj-y += group_cpus.o
>
> # GCC library routines
> diff --git a/lib/time_stats.c b/lib/time_stats.c
> new file mode 100644
> index 000000000000..a5b9f149e2c6
> --- /dev/null
> +++ b/lib/time_stats.c
> @@ -0,0 +1,264 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/eytzinger.h>
> +#include <linux/jiffies.h>
> +#include <linux/percpu.h>
> +#include <linux/time.h>
> +#include <linux/time_stats.h>
> +#include <linux/spinlock.h>
> +
> +static const struct time_unit time_units[] = {
> + { "ns", 1 },
> + { "us", NSEC_PER_USEC },
> + { "ms", NSEC_PER_MSEC },
> + { "s", NSEC_PER_SEC },
> + { "m", (u64) NSEC_PER_SEC * 60},
> + { "h", (u64) NSEC_PER_SEC * 3600},
> + { "eon", U64_MAX },
> +};
> +
> +const struct time_unit *pick_time_units(u64 ns)
> +{
> + const struct time_unit *u;
> +
> + for (u = time_units;
> + u + 1 < time_units + ARRAY_SIZE(time_units) &&
> + ns >= u[1].nsecs << 1;
> + u++)
> + ;
> +
> + return u;
> +}
> +
> +static void quantiles_update(struct quantiles *q, u64 v)
> +{
> + unsigned i = 0;
> +
> + while (i < ARRAY_SIZE(q->entries)) {
> + struct quantile_entry *e = q->entries + i;
> +
> + if (unlikely(!e->step)) {
> + e->m = v;
> + e->step = max_t(unsigned, v / 2, 1024);
> + } else if (e->m > v) {
> + e->m = e->m >= e->step
> + ? e->m - e->step
> + : 0;
> + } else if (e->m < v) {
> + e->m = e->m + e->step > e->m
> + ? e->m + e->step
> + : U32_MAX;
> + }
> +
> + if ((e->m > v ? e->m - v : v - e->m) < e->step)
> + e->step = max_t(unsigned, e->step / 2, 1);
> +
> + if (v >= e->m)
> + break;
> +
> + i = eytzinger0_child(i, v > e->m);
> + }
> +}
> +
> +static inline void time_stats_update_one(struct time_stats *stats,
> + u64 start, u64 end)
> +{
> + u64 duration, freq;
> +
> + if (time_after64(end, start)) {
> + duration = end - start;
> + mean_and_variance_update(&stats->duration_stats, duration);
> + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
> + stats->max_duration = max(stats->max_duration, duration);
> + stats->min_duration = min(stats->min_duration, duration);
> + stats->total_duration += duration;
> +
> + if (stats->quantiles_enabled)
> + quantiles_update(&stats->quantiles, duration);
> + }
> +
> + if (time_after64(end, stats->last_event)) {
> + freq = end - stats->last_event;
> + mean_and_variance_update(&stats->freq_stats, freq);
> + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
> + stats->max_freq = max(stats->max_freq, freq);
> + stats->min_freq = min(stats->min_freq, freq);
> + stats->last_event = end;
> + }
> +}
> +
> +void __time_stats_clear_buffer(struct time_stats *stats,
> + struct time_stat_buffer *b)
> +{
> + for (struct time_stat_buffer_entry *i = b->entries;
> + i < b->entries + ARRAY_SIZE(b->entries);
> + i++)
> + time_stats_update_one(stats, i->start, i->end);
> + b->nr = 0;
> +}
> +EXPORT_SYMBOL_GPL(__time_stats_clear_buffer);
> +
> +static noinline void time_stats_clear_buffer(struct time_stats *stats,
> + struct time_stat_buffer *b)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&stats->lock, flags);
> + __time_stats_clear_buffer(stats, b);
> + spin_unlock_irqrestore(&stats->lock, flags);
> +}
> +
> +void __time_stats_update(struct time_stats *stats, u64 start, u64 end)
> +{
> + unsigned long flags;
> +
> + WARN_ONCE(!stats->duration_stats_weighted.weight ||
> + !stats->freq_stats_weighted.weight,
> + "uninitialized time_stats");
> +
> + if (!stats->buffer) {
> + spin_lock_irqsave(&stats->lock, flags);
> + time_stats_update_one(stats, start, end);
> +
> + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
> + stats->duration_stats.n > 1024)
> + stats->buffer =
> + alloc_percpu_gfp(struct time_stat_buffer,
> + GFP_ATOMIC);
> + spin_unlock_irqrestore(&stats->lock, flags);
> + } else {
> + struct time_stat_buffer *b;
> +
> + preempt_disable();
> + b = this_cpu_ptr(stats->buffer);
> +
> + BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
> + b->entries[b->nr++] = (struct time_stat_buffer_entry) {
> + .start = start,
> + .end = end
> + };
> +
> + if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
> + time_stats_clear_buffer(stats, b);
> + preempt_enable();
> + }
> +}
> +EXPORT_SYMBOL_GPL(__time_stats_update);
> +
> +#include <linux/seq_buf.h>
> +
> +static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
> +{
> + const struct time_unit *u = pick_time_units(ns);
> +
> + seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
> +}
> +
> +void time_stats_to_seq_buf(struct seq_buf *out, struct time_stats *stats)
> +{
> + s64 f_mean = 0, d_mean = 0;
> + u64 f_stddev = 0, d_stddev = 0;
> +
> + if (stats->buffer) {
> + int cpu;
> +
> + spin_lock_irq(&stats->lock);
> + for_each_possible_cpu(cpu)
> + __time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> + spin_unlock_irq(&stats->lock);
> + }
> +
> + /*
> + * avoid divide by zero
> + */
> + if (stats->freq_stats.n) {
> + f_mean = mean_and_variance_get_mean(stats->freq_stats);
> + f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
> + d_mean = mean_and_variance_get_mean(stats->duration_stats);
> + d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
> + }
> +
> + seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
> +
> + seq_buf_printf(out, " since mount recent\n");
> +
> + seq_buf_printf(out, "duration of events\n");
> +
> + seq_buf_printf(out, " min: ");
> + seq_buf_time_units_aligned(out, stats->min_duration);
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " max: ");
> + seq_buf_time_units_aligned(out, stats->max_duration);
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " total: ");
> + seq_buf_time_units_aligned(out, stats->total_duration);
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " mean: ");
> + seq_buf_time_units_aligned(out, d_mean);
> + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " stddev: ");
> + seq_buf_time_units_aligned(out, d_stddev);
> + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, "time between events\n");
> +
> + seq_buf_printf(out, " min: ");
> + seq_buf_time_units_aligned(out, stats->min_freq);
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " max: ");
> + seq_buf_time_units_aligned(out, stats->max_freq);
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " mean: ");
> + seq_buf_time_units_aligned(out, f_mean);
> + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
> + seq_buf_printf(out, "\n");
> +
> + seq_buf_printf(out, " stddev: ");
> + seq_buf_time_units_aligned(out, f_stddev);
> + seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
> + seq_buf_printf(out, "\n");
> +
> + if (stats->quantiles_enabled) {
> + int i = eytzinger0_first(NR_QUANTILES);
> + const struct time_unit *u =
> + pick_time_units(stats->quantiles.entries[i].m);
> + u64 last_q = 0;
> +
> + seq_buf_printf(out, "quantiles (%s):\t", u->name);
> + eytzinger0_for_each(i, NR_QUANTILES) {
> + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> +
> + u64 q = max(stats->quantiles.entries[i].m, last_q);
> + seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
> + if (is_last)
> + seq_buf_printf(out, "\n");
> + last_q = q;
> + }
> + }
> +}
> +EXPORT_SYMBOL_GPL(time_stats_to_seq_buf);
> +
> +void time_stats_exit(struct time_stats *stats)
> +{
> + free_percpu(stats->buffer);
> +}
> +EXPORT_SYMBOL_GPL(time_stats_exit);
> +
> +void time_stats_init(struct time_stats *stats)
> +{
> + memset(stats, 0, sizeof(*stats));
> + stats->duration_stats_weighted.weight = 8;
> + stats->freq_stats_weighted.weight = 8;
> + stats->min_duration = U64_MAX;
> + stats->min_freq = U64_MAX;
> + spin_lock_init(&stats->lock);
> +}
> +EXPORT_SYMBOL_GPL(time_stats_init);
> --
> 2.43.0
>
>

2024-01-27 02:18:14

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 2/5] eytzinger: Promote to include/linux/

On Fri, Jan 26, 2024 at 05:06:52PM -0500, Kent Overstreet wrote:
> eytzinger trees are a faster alternative to binary search. They're a bit
> more expensive to setup, but lookups perform much better assuming the
> tree isn't entirely in cache.
>
> Binary search is a worst case scenario for branch prediction and
> prefetching, but eytzinger trees have children adjacent in memory and
> thus we can prefetch before knowing the result of a comparison.
>
> An eytzinger tree is a binary tree laid out in an array, with the same
> geometry as the usual binary heap construction, but used as a search
> tree instead.
>
> Signed-off-by: Kent Overstreet <[email protected]>

This looks more or less like what I remember of building heaps and
squinting at my horrible handwritten notes about eytzinger trees from
back in the day.

Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/bcachefs/bset.c | 2 +-
> fs/bcachefs/journal_seq_blacklist.c | 6 +-
> fs/bcachefs/replicas.c | 17 ++-
> fs/bcachefs/replicas.h | 3 +-
> fs/bcachefs/super-io.h | 2 +-
> fs/bcachefs/util.c | 145 +--------------------
> fs/bcachefs/util.h | 4 -
> {fs/bcachefs => include/linux}/eytzinger.h | 56 ++++----
> lib/sort.c | 85 ++++++++++++
> 9 files changed, 136 insertions(+), 184 deletions(-)
> rename {fs/bcachefs => include/linux}/eytzinger.h (78%)
>
> diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
> index 3fd1085b6c61..1d77aa55d641 100644
> --- a/fs/bcachefs/bset.c
> +++ b/fs/bcachefs/bset.c
> @@ -9,12 +9,12 @@
> #include "bcachefs.h"
> #include "btree_cache.h"
> #include "bset.h"
> -#include "eytzinger.h"
> #include "trace.h"
> #include "util.h"
>
> #include <asm/unaligned.h>
> #include <linux/console.h>
> +#include <linux/eytzinger.h>
> #include <linux/random.h>
> #include <linux/prefetch.h>
>
> diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
> index 0200e299cfbb..024c9b1b323f 100644
> --- a/fs/bcachefs/journal_seq_blacklist.c
> +++ b/fs/bcachefs/journal_seq_blacklist.c
> @@ -2,10 +2,11 @@
>
> #include "bcachefs.h"
> #include "btree_iter.h"
> -#include "eytzinger.h"
> #include "journal_seq_blacklist.h"
> #include "super-io.h"
>
> +#include <linux/eytzinger.h>
> +
> /*
> * journal_seq_blacklist machinery:
> *
> @@ -119,8 +120,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
> return ret ?: bch2_blacklist_table_initialize(c);
> }
>
> -static int journal_seq_blacklist_table_cmp(const void *_l,
> - const void *_r, size_t size)
> +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
> {
> const struct journal_seq_blacklist_table_entry *l = _l;
> const struct journal_seq_blacklist_table_entry *r = _r;
> diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
> index cc2672c12031..75fdce373f76 100644
> --- a/fs/bcachefs/replicas.c
> +++ b/fs/bcachefs/replicas.c
> @@ -6,12 +6,15 @@
> #include "replicas.h"
> #include "super-io.h"
>
> +#include <linux/sort.h>
> +
> static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
> struct bch_replicas_cpu *);
>
> /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
> -static int bch2_memcmp(const void *l, const void *r, size_t size)
> +static int bch2_memcmp(const void *l, const void *r, const void *priv)
> {
> + size_t size = (size_t) priv;
> return memcmp(l, r, size);
> }
>
> @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
>
> static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
> {
> - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
> + eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
> + bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
> }
>
> static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
> @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
> {
> unsigned i;
>
> - sort_cmp_size(cpu_r->entries,
> - cpu_r->nr,
> - cpu_r->entry_size,
> - bch2_memcmp, NULL);
> + sort_r(cpu_r->entries,
> + cpu_r->nr,
> + cpu_r->entry_size,
> + bch2_memcmp, NULL,
> + (void *)(size_t)cpu_r->entry_size);
>
> for (i = 0; i < cpu_r->nr; i++) {
> struct bch_replicas_entry_v1 *e =
> diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
> index 654a4b26d3a3..983cce782ac2 100644
> --- a/fs/bcachefs/replicas.h
> +++ b/fs/bcachefs/replicas.h
> @@ -3,9 +3,10 @@
> #define _BCACHEFS_REPLICAS_H
>
> #include "bkey.h"
> -#include "eytzinger.h"
> #include "replicas_types.h"
>
> +#include <linux/eytzinger.h>
> +
> void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
> void bch2_replicas_entry_to_text(struct printbuf *,
> struct bch_replicas_entry_v1 *);
> diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
> index 95e80e06316b..f37620919e11 100644
> --- a/fs/bcachefs/super-io.h
> +++ b/fs/bcachefs/super-io.h
> @@ -3,12 +3,12 @@
> #define _BCACHEFS_SUPER_IO_H
>
> #include "extents.h"
> -#include "eytzinger.h"
> #include "super_types.h"
> #include "super.h"
> #include "sb-members.h"
>
> #include <asm/byteorder.h>
> +#include <linux/eytzinger.h>
>
> static inline bool bch2_version_compatible(u16 version)
> {
> diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> index d7ea95abb9df..c7cf9c6fcf9a 100644
> --- a/fs/bcachefs/util.c
> +++ b/fs/bcachefs/util.c
> @@ -11,6 +11,7 @@
> #include <linux/console.h>
> #include <linux/ctype.h>
> #include <linux/debugfs.h>
> +#include <linux/eytzinger.h>
> #include <linux/freezer.h>
> #include <linux/kthread.h>
> #include <linux/log2.h>
> @@ -24,7 +25,6 @@
> #include <linux/sched/clock.h>
> #include <linux/mean_and_variance.h>
>
> -#include "eytzinger.h"
> #include "util.h"
>
> static const char si_units[] = "?kMGTPEZY";
> @@ -863,149 +863,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
> }
> }
>
> -static int alignment_ok(const void *base, size_t align)
> -{
> - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
> - ((unsigned long)base & (align - 1)) == 0;
> -}
> -
> -static void u32_swap(void *a, void *b, size_t size)
> -{
> - u32 t = *(u32 *)a;
> - *(u32 *)a = *(u32 *)b;
> - *(u32 *)b = t;
> -}
> -
> -static void u64_swap(void *a, void *b, size_t size)
> -{
> - u64 t = *(u64 *)a;
> - *(u64 *)a = *(u64 *)b;
> - *(u64 *)b = t;
> -}
> -
> -static void generic_swap(void *a, void *b, size_t size)
> -{
> - char t;
> -
> - do {
> - t = *(char *)a;
> - *(char *)a++ = *(char *)b;
> - *(char *)b++ = t;
> - } while (--size > 0);
> -}
> -
> -static inline int do_cmp(void *base, size_t n, size_t size,
> - int (*cmp_func)(const void *, const void *, size_t),
> - size_t l, size_t r)
> -{
> - return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
> - base + inorder_to_eytzinger0(r, n) * size,
> - size);
> -}
> -
> -static inline void do_swap(void *base, size_t n, size_t size,
> - void (*swap_func)(void *, void *, size_t),
> - size_t l, size_t r)
> -{
> - swap_func(base + inorder_to_eytzinger0(l, n) * size,
> - base + inorder_to_eytzinger0(r, n) * size,
> - size);
> -}
> -
> -void eytzinger0_sort(void *base, size_t n, size_t size,
> - int (*cmp_func)(const void *, const void *, size_t),
> - void (*swap_func)(void *, void *, size_t))
> -{
> - int i, c, r;
> -
> - if (!swap_func) {
> - if (size == 4 && alignment_ok(base, 4))
> - swap_func = u32_swap;
> - else if (size == 8 && alignment_ok(base, 8))
> - swap_func = u64_swap;
> - else
> - swap_func = generic_swap;
> - }
> -
> - /* heapify */
> - for (i = n / 2 - 1; i >= 0; --i) {
> - for (r = i; r * 2 + 1 < n; r = c) {
> - c = r * 2 + 1;
> -
> - if (c + 1 < n &&
> - do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
> - c++;
> -
> - if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
> - break;
> -
> - do_swap(base, n, size, swap_func, r, c);
> - }
> - }
> -
> - /* sort */
> - for (i = n - 1; i > 0; --i) {
> - do_swap(base, n, size, swap_func, 0, i);
> -
> - for (r = 0; r * 2 + 1 < i; r = c) {
> - c = r * 2 + 1;
> -
> - if (c + 1 < i &&
> - do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
> - c++;
> -
> - if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
> - break;
> -
> - do_swap(base, n, size, swap_func, r, c);
> - }
> - }
> -}
> -
> -void sort_cmp_size(void *base, size_t num, size_t size,
> - int (*cmp_func)(const void *, const void *, size_t),
> - void (*swap_func)(void *, void *, size_t size))
> -{
> - /* pre-scale counters for performance */
> - int i = (num/2 - 1) * size, n = num * size, c, r;
> -
> - if (!swap_func) {
> - if (size == 4 && alignment_ok(base, 4))
> - swap_func = u32_swap;
> - else if (size == 8 && alignment_ok(base, 8))
> - swap_func = u64_swap;
> - else
> - swap_func = generic_swap;
> - }
> -
> - /* heapify */
> - for ( ; i >= 0; i -= size) {
> - for (r = i; r * 2 + size < n; r = c) {
> - c = r * 2 + size;
> - if (c < n - size &&
> - cmp_func(base + c, base + c + size, size) < 0)
> - c += size;
> - if (cmp_func(base + r, base + c, size) >= 0)
> - break;
> - swap_func(base + r, base + c, size);
> - }
> - }
> -
> - /* sort */
> - for (i = n - size; i > 0; i -= size) {
> - swap_func(base, base + i, size);
> - for (r = 0; r * 2 + size < i; r = c) {
> - c = r * 2 + size;
> - if (c < i - size &&
> - cmp_func(base + c, base + c + size, size) < 0)
> - c += size;
> - if (cmp_func(base + r, base + c, size) >= 0)
> - break;
> - swap_func(base + r, base + c, size);
> - }
> - }
> -}
> -
> static void mempool_free_vp(void *element, void *pool_data)
> {
> size_t size = (size_t) pool_data;
> diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> index 0059481995ef..c3b11c3d24ea 100644
> --- a/fs/bcachefs/util.h
> +++ b/fs/bcachefs/util.h
> @@ -737,10 +737,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
> memset(s + bytes, c, rem);
> }
>
> -void sort_cmp_size(void *base, size_t num, size_t size,
> - int (*cmp_func)(const void *, const void *, size_t),
> - void (*swap_func)(void *, void *, size_t));
> -
> /* just the memmove, doesn't update @_nr */
> #define __array_insert_item(_array, _nr, _pos) \
> memmove(&(_array)[(_pos) + 1], \
> diff --git a/fs/bcachefs/eytzinger.h b/include/linux/eytzinger.h
> similarity index 78%
> rename from fs/bcachefs/eytzinger.h
> rename to include/linux/eytzinger.h
> index b04750dbf870..9565a5c26cd5 100644
> --- a/fs/bcachefs/eytzinger.h
> +++ b/include/linux/eytzinger.h
> @@ -1,27 +1,37 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> -#ifndef _EYTZINGER_H
> -#define _EYTZINGER_H
> +#ifndef _LINUX_EYTZINGER_H
> +#define _LINUX_EYTZINGER_H
>
> #include <linux/bitops.h>
> #include <linux/log2.h>
>
> -#include "util.h"
> +#ifdef EYTZINGER_DEBUG
> +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
> +#else
> +#define EYTZINGER_BUG_ON(cond)
> +#endif
>
> /*
> * Traversal for trees in eytzinger layout - a full binary tree layed out in an
> - * array
> - */
> -
> -/*
> - * One based indexing version:
> + * array.
> *
> - * With one based indexing each level of the tree starts at a power of two -
> - * good for cacheline alignment:
> + * Consider using an eytzinger tree any time you would otherwise be doing binary
> + * search over an array. Binary search is a worst case scenario for branch
> + * prediction and prefetching, but in an eytzinger tree every node's children
> + * are adjacent in memory, thus we can prefetch children before knowing the
> + * result of the comparison, assuming multiple nodes fit on a cacheline.
> + *
> + * Two variants are provided, for one based indexing and zero based indexing.
> + *
> + * Zero based indexing is more convenient, but one based indexing has better
> + * alignment and thus better performance because each new level of the tree
> + * starts at a power of two, and thus if element 0 was cacheline aligned, each
> + * new level will be as well.
> */
>
> static inline unsigned eytzinger1_child(unsigned i, unsigned child)
> {
> - EBUG_ON(child > 1);
> + EYTZINGER_BUG_ON(child > 1);
>
> return (i << 1) + child;
> }
> @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
>
> static inline unsigned eytzinger1_next(unsigned i, unsigned size)
> {
> - EBUG_ON(i > size);
> + EYTZINGER_BUG_ON(i > size);
>
> if (eytzinger1_right_child(i) <= size) {
> i = eytzinger1_right_child(i);
> @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
>
> static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
> {
> - EBUG_ON(i > size);
> + EYTZINGER_BUG_ON(i > size);
>
> if (eytzinger1_left_child(i) <= size) {
> i = eytzinger1_left_child(i) + 1;
> @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
> unsigned shift = __fls(size) - b;
> int s;
>
> - EBUG_ON(!i || i > size);
> + EYTZINGER_BUG_ON(!i || i > size);
>
> i ^= 1U << b;
> i <<= 1;
> @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
> unsigned shift;
> int s;
>
> - EBUG_ON(!i || i > size);
> + EYTZINGER_BUG_ON(!i || i > size);
>
> /*
> * sign bit trick:
> @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
>
> static inline unsigned eytzinger0_child(unsigned i, unsigned child)
> {
> - EBUG_ON(child > 1);
> + EYTZINGER_BUG_ON(child > 1);
>
> return (i << 1) + 1 + child;
> }
> @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
> (_i) != -1; \
> (_i) = eytzinger0_next((_i), (_size)))
>
> -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
> -
> /* return greatest node <= @search, or -1 if not found */
> static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
> - eytzinger_cmp_fn cmp, const void *search)
> + cmp_func_t cmp, const void *search)
> {
> unsigned i, n = 0;
>
> @@ -244,7 +252,7 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
>
> do {
> i = n;
> - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
> + n = eytzinger0_child(i, cmp(search, base + i * size) >= 0);
> } while (n < nr);
>
> if (n & 1) {
> @@ -274,8 +282,8 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
> _i; \
> })
>
> -void eytzinger0_sort(void *, size_t, size_t,
> - int (*cmp_func)(const void *, const void *, size_t),
> - void (*swap_func)(void *, void *, size_t));
> +void eytzinger0_sort_r(void *, size_t, size_t,
> + cmp_r_func_t, swap_r_func_t, const void *);
> +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
>
> -#endif /* _EYTZINGER_H */
> +#endif /* _LINUX_EYTZINGER_H */
> diff --git a/lib/sort.c b/lib/sort.c
> index b399bf10d675..3dfa83d86bbb 100644
> --- a/lib/sort.c
> +++ b/lib/sort.c
> @@ -290,3 +290,88 @@ void sort(void *base, size_t num, size_t size,
> return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
> }
> EXPORT_SYMBOL(sort);
> +
> +#include <linux/eytzinger.h>
> +
> +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
> + cmp_r_func_t cmp_func, const void *priv,
> + size_t l, size_t r)
> +{
> + return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
> + base + inorder_to_eytzinger0(r, n) * size,
> + cmp_func, priv);
> +}
> +
> +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
> + swap_r_func_t swap_func, const void *priv,
> + size_t l, size_t r)
> +{
> + do_swap(base + inorder_to_eytzinger0(l, n) * size,
> + base + inorder_to_eytzinger0(r, n) * size,
> + size, swap_func, priv);
> +}
> +
> +void eytzinger0_sort_r(void *base, size_t n, size_t size,
> + cmp_r_func_t cmp_func,
> + swap_r_func_t swap_func,
> + const void *priv)
> +{
> + int i, c, r;
> +
> + if (!swap_func) {
> + if (is_aligned(base, size, 8))
> + swap_func = SWAP_WORDS_64;
> + else if (is_aligned(base, size, 4))
> + swap_func = SWAP_WORDS_32;
> + else
> + swap_func = SWAP_BYTES;
> + }
> +
> + /* heapify */
> + for (i = n / 2 - 1; i >= 0; --i) {
> + for (r = i; r * 2 + 1 < n; r = c) {
> + c = r * 2 + 1;
> +
> + if (c + 1 < n &&
> + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
> + c++;
> +
> + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
> + break;
> +
> + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
> + }
> + }
> +
> + /* sort */
> + for (i = n - 1; i > 0; --i) {
> + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
> +
> + for (r = 0; r * 2 + 1 < i; r = c) {
> + c = r * 2 + 1;
> +
> + if (c + 1 < i &&
> + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
> + c++;
> +
> + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
> + break;
> +
> + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
> + }
> + }
> +}
> +EXPORT_SYMBOL_GPL(eytzinger0_sort_r);
> +
> +void eytzinger0_sort(void *base, size_t n, size_t size,
> + cmp_func_t cmp_func,
> + swap_func_t swap_func)
> +{
> + struct wrapper w = {
> + .cmp = cmp_func,
> + .swap = swap_func,
> + };
> +
> + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
> +}
> +EXPORT_SYMBOL_GPL(eytzinger0_sort);
> --
> 2.43.0
>
>

2024-01-27 07:09:52

by Coly Li

[permalink] [raw]
Subject: Re: [PATCH 5/5] bcache: Convert to lib/time_stats

On Fri, Jan 26, 2024 at 05:06:55PM -0500, Kent Overstreet wrote:
> delete bcache's time stats code, convert to newer version from bcachefs.
>
> example output:
>
> root@moria-kvm:/sys/fs/bcache/bdaedb8c-4554-4dd2-87e4-276e51eb47cc# cat internal/btree_sort_times
> count: 6414
> since mount recent
> duration of events
> min: 440 ns
> max: 1102 us
> total: 674 ms
> mean: 105 us 102 us
> stddev: 101 us 88 us
> time between events
> min: 881 ns
> max: 3 s
> mean: 7 ms 6 ms
> stddev: 52 ms 6 ms
>
> Cc: Coly Li <[email protected]>
> Cc: [email protected]
> Signed-off-by: Kent Overstreet <[email protected]>

Acked-by: Coly Li <[email protected]>

Thanks.


Coly Li

> ---
> drivers/md/bcache/Kconfig | 1 +
> drivers/md/bcache/bcache.h | 1 +
> drivers/md/bcache/bset.c | 6 +++--
> drivers/md/bcache/bset.h | 1 +
> drivers/md/bcache/btree.c | 6 ++---
> drivers/md/bcache/super.c | 7 +++++
> drivers/md/bcache/sysfs.c | 25 +++++++++---------
> drivers/md/bcache/util.c | 30 ----------------------
> drivers/md/bcache/util.h | 52 +++++---------------------------------
> 9 files changed, 37 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
> index b2d10063d35f..7ea057983d3d 100644
> --- a/drivers/md/bcache/Kconfig
> +++ b/drivers/md/bcache/Kconfig
> @@ -5,6 +5,7 @@ config BCACHE
> select BLOCK_HOLDER_DEPRECATED if SYSFS
> select CRC64
> select CLOSURES
> + select TIME_STATS
> help
> Allows a block device to be used as cache for other devices; uses
> a btree for indexing and the layout is optimized for SSDs.
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 6ae2329052c9..76e7b494c394 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -186,6 +186,7 @@
> #include <linux/rbtree.h>
> #include <linux/rwsem.h>
> #include <linux/refcount.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
> #include <linux/workqueue.h>
> #include <linux/kthread.h>
> diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
> index 2bba4d6aaaa2..31c08d4ab83b 100644
> --- a/drivers/md/bcache/bset.c
> +++ b/drivers/md/bcache/bset.c
> @@ -1177,6 +1177,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
>
> void bch_bset_sort_state_free(struct bset_sort_state *state)
> {
> + time_stats_exit(&state->time);
> mempool_exit(&state->pool);
> }
>
> @@ -1184,6 +1185,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
> unsigned int page_order)
> {
> spin_lock_init(&state->time.lock);
> + time_stats_init(&state->time);
>
> state->page_order = page_order;
> state->crit_factor = int_sqrt(1 << page_order);
> @@ -1286,7 +1288,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
> bch_bset_build_written_tree(b);
>
> if (!start)
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
> }
>
> void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
> @@ -1329,7 +1331,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
>
> btree_mergesort(b, new->set->data, &iter, false, true);
>
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
>
> new->set->size = 0; // XXX: why?
> }
> diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
> index d795c84246b0..13e524ad7783 100644
> --- a/drivers/md/bcache/bset.h
> +++ b/drivers/md/bcache/bset.h
> @@ -3,6 +3,7 @@
> #define _BCACHE_BSET_H
>
> #include <linux/kernel.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
>
> #include "bcache_ondisk.h"
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 196cdacce38f..0ed337c5f0dc 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
> goto err;
>
> bch_btree_node_read_done(b);
> - bch_time_stats_update(&b->c->btree_read_time, start_time);
> + time_stats_update(&b->c->btree_read_time, start_time);
>
> return;
> err:
> @@ -1852,7 +1852,7 @@ static void bch_btree_gc(struct cache_set *c)
> bch_btree_gc_finish(c);
> wake_up_allocators(c);
>
> - bch_time_stats_update(&c->btree_gc_time, start_time);
> + time_stats_update(&c->btree_gc_time, start_time);
>
> stats.key_bytes *= sizeof(uint64_t);
> stats.data <<= 9;
> @@ -2343,7 +2343,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
> btree_node_free(b);
> rw_unlock(true, n1);
>
> - bch_time_stats_update(&b->c->btree_split_time, start_time);
> + time_stats_update(&b->c->btree_split_time, start_time);
>
> return 0;
> err_free2:
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index dc3f50f69714..625e4883299c 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1676,6 +1676,9 @@ static CLOSURE_CALLBACK(cache_set_free)
>
> debugfs_remove(c->debug);
>
> + time_stats_exit(&c->btree_read_time);
> + time_stats_exit(&c->btree_split_time);
> + time_stats_exit(&c->btree_gc_time);
> bch_open_buckets_free(c);
> bch_btree_cache_free(c);
> bch_journal_free(c);
> @@ -1913,6 +1916,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
> INIT_LIST_HEAD(&c->btree_cache_freed);
> INIT_LIST_HEAD(&c->data_buckets);
>
> + time_stats_init(&c->btree_gc_time);
> + time_stats_init(&c->btree_split_time);
> + time_stats_init(&c->btree_read_time);
> +
> iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
> sizeof(struct btree_iter_set);
>
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index a438efb66069..01cc5c632f08 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -14,6 +14,7 @@
> #include "features.h"
>
> #include <linux/blkdev.h>
> +#include <linux/seq_buf.h>
> #include <linux/sort.h>
> #include <linux/sched/clock.h>
>
> @@ -79,10 +80,10 @@ read_attribute(active_journal_entries);
> read_attribute(backing_dev_name);
> read_attribute(backing_dev_uuid);
>
> -sysfs_time_stats_attribute(btree_gc, sec, ms);
> -sysfs_time_stats_attribute(btree_split, sec, us);
> -sysfs_time_stats_attribute(btree_sort, ms, us);
> -sysfs_time_stats_attribute(btree_read, ms, us);
> +read_attribute(btree_gc_times);
> +read_attribute(btree_split_times);
> +read_attribute(btree_sort_times);
> +read_attribute(btree_read_times);
>
> read_attribute(btree_nodes);
> read_attribute(btree_used_percent);
> @@ -743,10 +744,10 @@ SHOW(__bch_cache_set)
> sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
> sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
>
> - sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
> - sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
> - sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
> - sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
> + sysfs_print_time_stats(&c->btree_gc_time, btree_gc_times);
> + sysfs_print_time_stats(&c->btree_split_time, btree_split_times);
> + sysfs_print_time_stats(&c->sort.time, btree_sort_times);
> + sysfs_print_time_stats(&c->btree_read_time, btree_read_times);
>
> sysfs_print(btree_used_percent, bch_btree_used(c));
> sysfs_print(btree_nodes, c->gc_stats.nodes);
> @@ -989,10 +990,10 @@ KTYPE(bch_cache_set);
> static struct attribute *bch_cache_set_internal_attrs[] = {
> &sysfs_active_journal_entries,
>
> - sysfs_time_stats_attribute_list(btree_gc, sec, ms)
> - sysfs_time_stats_attribute_list(btree_split, sec, us)
> - sysfs_time_stats_attribute_list(btree_sort, ms, us)
> - sysfs_time_stats_attribute_list(btree_read, ms, us)
> + &sysfs_btree_gc_times,
> + &sysfs_btree_split_times,
> + &sysfs_btree_sort_times,
> + &sysfs_btree_read_times,
>
> &sysfs_btree_nodes,
> &sysfs_btree_used_percent,
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index ae380bc3992e..95282bf0f9a7 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid)
> return i;
> }
>
> -void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
> -{
> - uint64_t now, duration, last;
> -
> - spin_lock(&stats->lock);
> -
> - now = local_clock();
> - duration = time_after64(now, start_time)
> - ? now - start_time : 0;
> - last = time_after64(now, stats->last)
> - ? now - stats->last : 0;
> -
> - stats->max_duration = max(stats->max_duration, duration);
> -
> - if (stats->last) {
> - ewma_add(stats->average_duration, duration, 8, 8);
> -
> - if (stats->average_frequency)
> - ewma_add(stats->average_frequency, last, 8, 8);
> - else
> - stats->average_frequency = last << 8;
> - } else {
> - stats->average_duration = duration << 8;
> - }
> -
> - stats->last = now ?: 1;
> -
> - spin_unlock(&stats->lock);
> -}
> -
> /**
> * bch_next_delay() - update ratelimiting statistics and calculate next delay
> * @d: the struct bch_ratelimit to update
> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
> index f61ab1bada6c..6fcb9db4f50d 100644
> --- a/drivers/md/bcache/util.h
> +++ b/drivers/md/bcache/util.h
> @@ -344,20 +344,6 @@ ssize_t bch_hprint(char *buf, int64_t v);
> bool bch_is_zero(const char *p, size_t n);
> int bch_parse_uuid(const char *s, char *uuid);
>
> -struct time_stats {
> - spinlock_t lock;
> - /*
> - * all fields are in nanoseconds, averages are ewmas stored left shifted
> - * by 8
> - */
> - uint64_t max_duration;
> - uint64_t average_duration;
> - uint64_t average_frequency;
> - uint64_t last;
> -};
> -
> -void bch_time_stats_update(struct time_stats *stats, uint64_t time);
> -
> static inline unsigned int local_clock_us(void)
> {
> return local_clock() >> 10;
> @@ -372,40 +358,16 @@ static inline unsigned int local_clock_us(void)
> sysfs_print(name ## _ ## stat ## _ ## units, \
> div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
>
> -#define sysfs_print_time_stats(stats, name, \
> - frequency_units, \
> - duration_units) \
> +#define sysfs_print_time_stats(stats, name) \
> do { \
> - __print_time_stat(stats, name, \
> - average_frequency, frequency_units); \
> - __print_time_stat(stats, name, \
> - average_duration, duration_units); \
> - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
> - div_u64((stats)->max_duration, \
> - NSEC_PER_ ## duration_units)); \
> - \
> - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
> - ? div_s64(local_clock() - (stats)->last, \
> - NSEC_PER_ ## frequency_units) \
> - : -1LL); \
> + if (attr == &sysfs_##name) { \
> + struct seq_buf seq; \
> + seq_buf_init(&seq, buf, PAGE_SIZE); \
> + time_stats_to_seq_buf(&seq, stats); \
> + return seq.len; \
> + } \
> } while (0)
>
> -#define sysfs_time_stats_attribute(name, \
> - frequency_units, \
> - duration_units) \
> -read_attribute(name ## _average_frequency_ ## frequency_units); \
> -read_attribute(name ## _average_duration_ ## duration_units); \
> -read_attribute(name ## _max_duration_ ## duration_units); \
> -read_attribute(name ## _last_ ## frequency_units)
> -
> -#define sysfs_time_stats_attribute_list(name, \
> - frequency_units, \
> - duration_units) \
> -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
> -&sysfs_ ## name ## _average_duration_ ## duration_units, \
> -&sysfs_ ## name ## _max_duration_ ## duration_units, \
> -&sysfs_ ## name ## _last_ ## frequency_units,
> -
> #define ewma_add(ewma, val, weight, factor) \
> ({ \
> (ewma) *= (weight) - 1; \
> --
> 2.43.0
>

2024-01-27 15:21:14

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 3/5] bcachefs: bch2_time_stats_to_seq_buf()

On Sat, Jan 27, 2024 at 03:49:29AM +0000, Joshua Ashton wrote:
> Kernel patches need descriptions.

I think this one was pretty clear from the name and type signature :)

>
> On January 26, 2024 10:06:53 PM GMT, Kent Overstreet <[email protected]> wrote:
> >Signed-off-by: Kent Overstreet <[email protected]>
> >---
> > fs/bcachefs/super.c | 2 +
> > fs/bcachefs/util.c | 129 +++++++++++++++++++++++++++++++++++++++-----
> > fs/bcachefs/util.h | 4 ++
> > 3 files changed, 121 insertions(+), 14 deletions(-)
> >
> >diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
> >index da8697c79a97..e74534096cb5 100644
> >--- a/fs/bcachefs/super.c
> >+++ b/fs/bcachefs/super.c
> >@@ -1262,6 +1262,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
> >
> > bch2_time_stats_init(&ca->io_latency[READ]);
> > bch2_time_stats_init(&ca->io_latency[WRITE]);
> >+ ca->io_latency[READ].quantiles_enabled = true;
> >+ ca->io_latency[WRITE].quantiles_enabled = true;
> >
> > ca->mi = bch2_mi_to_cpu(member);
> >
> >diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> >index c7cf9c6fcf9a..f2c8550c1331 100644
> >--- a/fs/bcachefs/util.c
> >+++ b/fs/bcachefs/util.c
> >@@ -505,10 +505,8 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
> >
> > void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
> > {
> >- const struct time_unit *u;
> > s64 f_mean = 0, d_mean = 0;
> >- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
> >- int i;
> >+ u64 f_stddev = 0, d_stddev = 0;
> >
> > if (stats->buffer) {
> > int cpu;
> >@@ -607,19 +605,122 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
> >
> > printbuf_tabstops_reset(out);
> >
> >- i = eytzinger0_first(NR_QUANTILES);
> >- u = pick_time_units(stats->quantiles.entries[i].m);
> >+ if (stats->quantiles_enabled) {
> >+ int i = eytzinger0_first(NR_QUANTILES);
> >+ const struct time_unit *u =
> >+ pick_time_units(stats->quantiles.entries[i].m);
> >+ u64 last_q = 0;
> >+
> >+ prt_printf(out, "quantiles (%s):\t", u->name);
> >+ eytzinger0_for_each(i, NR_QUANTILES) {
> >+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> >+
> >+ u64 q = max(stats->quantiles.entries[i].m, last_q);
> >+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
> >+ if (is_last)
> >+ prt_newline(out);
> >+ last_q = q;
> >+ }
> >+ }
> >+}
> >+
> >+#include <linux/seq_buf.h>
> >+
> >+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
> >+{
> >+ const struct time_unit *u = pick_time_units(ns);
> >+
> >+ seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
> >+}
> >+
> >+void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats)
> >+{
> >+ s64 f_mean = 0, d_mean = 0;
> >+ u64 f_stddev = 0, d_stddev = 0;
> >+
> >+ if (stats->buffer) {
> >+ int cpu;
> >
> >- prt_printf(out, "quantiles (%s):\t", u->name);
> >- eytzinger0_for_each(i, NR_QUANTILES) {
> >- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> >+ spin_lock_irq(&stats->lock);
> >+ for_each_possible_cpu(cpu)
> >+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> >+ spin_unlock_irq(&stats->lock);
> >+ }
> >
> >- q = max(stats->quantiles.entries[i].m, last_q);
> >- prt_printf(out, "%llu ",
> >- div_u64(q, u->nsecs));
> >- if (is_last)
> >- prt_newline(out);
> >- last_q = q;
> >+ /*
> >+ * avoid divide by zero
> >+ */
> >+ if (stats->freq_stats.n) {
> >+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
> >+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
> >+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
> >+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
> >+ }
> >+
> >+ seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
> >+
> >+ seq_buf_printf(out, " since mount recent\n");
> >+
> >+ seq_buf_printf(out, "duration of events\n");
> >+
> >+ seq_buf_printf(out, " min: ");
> >+ seq_buf_time_units_aligned(out, stats->min_duration);
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " max: ");
> >+ seq_buf_time_units_aligned(out, stats->max_duration);
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " total: ");
> >+ seq_buf_time_units_aligned(out, stats->total_duration);
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " mean: ");
> >+ seq_buf_time_units_aligned(out, d_mean);
> >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " stddev: ");
> >+ seq_buf_time_units_aligned(out, d_stddev);
> >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, "time between events\n");
> >+
> >+ seq_buf_printf(out, " min: ");
> >+ seq_buf_time_units_aligned(out, stats->min_freq);
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " max: ");
> >+ seq_buf_time_units_aligned(out, stats->max_freq);
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " mean: ");
> >+ seq_buf_time_units_aligned(out, f_mean);
> >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
> >+ seq_buf_printf(out, "\n");
> >+
> >+ seq_buf_printf(out, " stddev: ");
> >+ seq_buf_time_units_aligned(out, f_stddev);
> >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
> >+ seq_buf_printf(out, "\n");
> >+
> >+ if (stats->quantiles_enabled) {
> >+ int i = eytzinger0_first(NR_QUANTILES);
> >+ const struct time_unit *u =
> >+ pick_time_units(stats->quantiles.entries[i].m);
> >+ u64 last_q = 0;
> >+
> >+ prt_printf(out, "quantiles (%s):\t", u->name);
> >+ eytzinger0_for_each(i, NR_QUANTILES) {
> >+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> >+
> >+ u64 q = max(stats->quantiles.entries[i].m, last_q);
> >+ seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
> >+ if (is_last)
> >+ seq_buf_printf(out, "\n");
> >+ last_q = q;
> >+ }
> > }
> > }
> > #else
> >diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> >index c3b11c3d24ea..7ff2d4fe26f6 100644
> >--- a/fs/bcachefs/util.h
> >+++ b/fs/bcachefs/util.h
> >@@ -382,6 +382,7 @@ struct bch2_time_stat_buffer {
> >
> > struct bch2_time_stats {
> > spinlock_t lock;
> >+ bool quantiles_enabled;
> > /* all fields are in nanoseconds */
> > u64 min_duration;
> > u64 max_duration;
> >@@ -435,6 +436,9 @@ static inline bool track_event_change(struct bch2_time_stats *stats,
> >
> > void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
> >
> >+struct seq_buf;
> >+void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *);
> >+
> > void bch2_time_stats_exit(struct bch2_time_stats *);
> > void bch2_time_stats_init(struct bch2_time_stats *);
> >
> >--
> >2.43.0
> >
> >
>
> - Joshie ✨

2024-01-27 17:45:59

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 3/5] bcachefs: bch2_time_stats_to_seq_buf()

On Sat, Jan 27, 2024 at 10:20:48AM -0500, Kent Overstreet wrote:
> On Sat, Jan 27, 2024 at 03:49:29AM +0000, Joshua Ashton wrote:
> > Kernel patches need descriptions.
>
> I think this one was pretty clear from the name and type signature :)

I was mildly confused by this patch until I read the /next/ patch and
realized "Oh, patch 3 ports the timestats rendering code to seq_buf so
that patch 4 can lift it to lib/."

But I also thought the purpose of all this was kinda obvious from the
cover letter, which was why I didn't say anything yesterday afternoon.

I really don't care about these sorts of things. IMO the obviously
interesting meat of the patchset is the patch that hoisted this to lib/
and the critical review is brainstorming how I'd use this new piece of
functionality, deciding if the api was ergonomic enough to start
prototyping upon, and sending a conditional RVB if I demonstrated enough
understanding of the problem being solved.

As for rearranging deck chairs inside fs/bcachefs, that's Kent's thing.

--D

> >
> > On January 26, 2024 10:06:53 PM GMT, Kent Overstreet <[email protected]> wrote:
> > >Signed-off-by: Kent Overstreet <[email protected]>
> > >---
> > > fs/bcachefs/super.c | 2 +
> > > fs/bcachefs/util.c | 129 +++++++++++++++++++++++++++++++++++++++-----
> > > fs/bcachefs/util.h | 4 ++
> > > 3 files changed, 121 insertions(+), 14 deletions(-)
> > >
> > >diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
> > >index da8697c79a97..e74534096cb5 100644
> > >--- a/fs/bcachefs/super.c
> > >+++ b/fs/bcachefs/super.c
> > >@@ -1262,6 +1262,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
> > >
> > > bch2_time_stats_init(&ca->io_latency[READ]);
> > > bch2_time_stats_init(&ca->io_latency[WRITE]);
> > >+ ca->io_latency[READ].quantiles_enabled = true;
> > >+ ca->io_latency[WRITE].quantiles_enabled = true;
> > >
> > > ca->mi = bch2_mi_to_cpu(member);
> > >
> > >diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> > >index c7cf9c6fcf9a..f2c8550c1331 100644
> > >--- a/fs/bcachefs/util.c
> > >+++ b/fs/bcachefs/util.c
> > >@@ -505,10 +505,8 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
> > >
> > > void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
> > > {
> > >- const struct time_unit *u;
> > > s64 f_mean = 0, d_mean = 0;
> > >- u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
> > >- int i;
> > >+ u64 f_stddev = 0, d_stddev = 0;
> > >
> > > if (stats->buffer) {
> > > int cpu;
> > >@@ -607,19 +605,122 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
> > >
> > > printbuf_tabstops_reset(out);
> > >
> > >- i = eytzinger0_first(NR_QUANTILES);
> > >- u = pick_time_units(stats->quantiles.entries[i].m);
> > >+ if (stats->quantiles_enabled) {
> > >+ int i = eytzinger0_first(NR_QUANTILES);
> > >+ const struct time_unit *u =
> > >+ pick_time_units(stats->quantiles.entries[i].m);
> > >+ u64 last_q = 0;
> > >+
> > >+ prt_printf(out, "quantiles (%s):\t", u->name);
> > >+ eytzinger0_for_each(i, NR_QUANTILES) {
> > >+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> > >+
> > >+ u64 q = max(stats->quantiles.entries[i].m, last_q);
> > >+ prt_printf(out, "%llu ", div_u64(q, u->nsecs));
> > >+ if (is_last)
> > >+ prt_newline(out);
> > >+ last_q = q;
> > >+ }
> > >+ }
> > >+}
> > >+
> > >+#include <linux/seq_buf.h>
> > >+
> > >+static void seq_buf_time_units_aligned(struct seq_buf *out, u64 ns)
> > >+{
> > >+ const struct time_unit *u = pick_time_units(ns);
> > >+
> > >+ seq_buf_printf(out, "%8llu %s", div64_u64(ns, u->nsecs), u->name);
> > >+}
> > >+
> > >+void bch2_time_stats_to_seq_buf(struct seq_buf *out, struct bch2_time_stats *stats)
> > >+{
> > >+ s64 f_mean = 0, d_mean = 0;
> > >+ u64 f_stddev = 0, d_stddev = 0;
> > >+
> > >+ if (stats->buffer) {
> > >+ int cpu;
> > >
> > >- prt_printf(out, "quantiles (%s):\t", u->name);
> > >- eytzinger0_for_each(i, NR_QUANTILES) {
> > >- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> > >+ spin_lock_irq(&stats->lock);
> > >+ for_each_possible_cpu(cpu)
> > >+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
> > >+ spin_unlock_irq(&stats->lock);
> > >+ }
> > >
> > >- q = max(stats->quantiles.entries[i].m, last_q);
> > >- prt_printf(out, "%llu ",
> > >- div_u64(q, u->nsecs));
> > >- if (is_last)
> > >- prt_newline(out);
> > >- last_q = q;
> > >+ /*
> > >+ * avoid divide by zero
> > >+ */
> > >+ if (stats->freq_stats.n) {
> > >+ f_mean = mean_and_variance_get_mean(stats->freq_stats);
> > >+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats);
> > >+ d_mean = mean_and_variance_get_mean(stats->duration_stats);
> > >+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats);
> > >+ }
> > >+
> > >+ seq_buf_printf(out, "count: %llu\n", stats->duration_stats.n);
> > >+
> > >+ seq_buf_printf(out, " since mount recent\n");
> > >+
> > >+ seq_buf_printf(out, "duration of events\n");
> > >+
> > >+ seq_buf_printf(out, " min: ");
> > >+ seq_buf_time_units_aligned(out, stats->min_duration);
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " max: ");
> > >+ seq_buf_time_units_aligned(out, stats->max_duration);
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " total: ");
> > >+ seq_buf_time_units_aligned(out, stats->total_duration);
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " mean: ");
> > >+ seq_buf_time_units_aligned(out, d_mean);
> > >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " stddev: ");
> > >+ seq_buf_time_units_aligned(out, d_stddev);
> > >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, "time between events\n");
> > >+
> > >+ seq_buf_printf(out, " min: ");
> > >+ seq_buf_time_units_aligned(out, stats->min_freq);
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " max: ");
> > >+ seq_buf_time_units_aligned(out, stats->max_freq);
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " mean: ");
> > >+ seq_buf_time_units_aligned(out, f_mean);
> > >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ seq_buf_printf(out, " stddev: ");
> > >+ seq_buf_time_units_aligned(out, f_stddev);
> > >+ seq_buf_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
> > >+ seq_buf_printf(out, "\n");
> > >+
> > >+ if (stats->quantiles_enabled) {
> > >+ int i = eytzinger0_first(NR_QUANTILES);
> > >+ const struct time_unit *u =
> > >+ pick_time_units(stats->quantiles.entries[i].m);
> > >+ u64 last_q = 0;
> > >+
> > >+ prt_printf(out, "quantiles (%s):\t", u->name);
> > >+ eytzinger0_for_each(i, NR_QUANTILES) {
> > >+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
> > >+
> > >+ u64 q = max(stats->quantiles.entries[i].m, last_q);
> > >+ seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
> > >+ if (is_last)
> > >+ seq_buf_printf(out, "\n");
> > >+ last_q = q;
> > >+ }
> > > }
> > > }
> > > #else
> > >diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> > >index c3b11c3d24ea..7ff2d4fe26f6 100644
> > >--- a/fs/bcachefs/util.h
> > >+++ b/fs/bcachefs/util.h
> > >@@ -382,6 +382,7 @@ struct bch2_time_stat_buffer {
> > >
> > > struct bch2_time_stats {
> > > spinlock_t lock;
> > >+ bool quantiles_enabled;
> > > /* all fields are in nanoseconds */
> > > u64 min_duration;
> > > u64 max_duration;
> > >@@ -435,6 +436,9 @@ static inline bool track_event_change(struct bch2_time_stats *stats,
> > >
> > > void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
> > >
> > >+struct seq_buf;
> > >+void bch2_time_stats_to_seq_buf(struct seq_buf *, struct bch2_time_stats *);
> > >+
> > > void bch2_time_stats_exit(struct bch2_time_stats *);
> > > void bch2_time_stats_init(struct bch2_time_stats *);
> > >
> > >--
> > >2.43.0
> > >
> > >
> >
> > - Joshie ????✨
>

2024-02-01 20:24:40

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 4/5] time_stats: Promote to lib/

Note that I needed the following patch to fix some build errors:

diff --git a/lib/time_stats.c b/lib/time_stats.c
index a5b9f149e2c6a..6618b1c35d700 100644
--- a/lib/time_stats.c
+++ b/lib/time_stats.c
@@ -6,6 +6,7 @@
#include <linux/time.h>
#include <linux/time_stats.h>
#include <linux/spinlock.h>
+#include <linux/module.h>

static const struct time_unit time_units[] = {
{ "ns", 1 },
@@ -29,6 +30,7 @@ const struct time_unit *pick_time_units(u64 ns)

return u;
}
+EXPORT_SYMBOL_GPL(pick_time_units);

static void quantiles_update(struct quantiles *q, u64 v)
{
@@ -262,3 +264,7 @@ void time_stats_init(struct time_stats *stats)
spin_lock_init(&stats->lock);
}
EXPORT_SYMBOL_GPL(time_stats_init);
+
+MODULE_AUTHOR("Kent Overstreet?");
+MODULE_DESCRIPTION("time statistics library");
+MODULE_LICENSE("GPL");

2024-02-02 07:30:51

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/5] mean and variance: Promote to lib/math

On Fri, Jan 26, 2024 at 05:06:51PM -0500, Kent Overstreet wrote:
> > Small statistics library, for taking in a series of values and computing
> mean, weighted mean, standard deviation and weighted deviation.
>
> The main use case is for statistics on latency measurements.

Oh, heh, I didn't realize this was a patch and not a cover letter.

A few things I noticed while adding timestats to xfs -- pahole says this
about the weighted mean and variance object:

struct mean_and_variance_weighted {
/* typedef bool */ _Bool init; /* 0 1 */
/* typedef u8 -> __u8 */ unsigned char weight; /* 1 1 */

/* XXX 6 bytes hole, try to pack */

/* typedef s64 -> __s64 */ long long int mean; /* 8 8 */
/* typedef u64 -> __u64 */ long long unsigned int variance; /* 16 8 */

/* size: 24, cachelines: 1, members: 4 */
/* sum members: 18, holes: 1, sum holes: 6 */
/* last cacheline: 24 bytes */
};

AFAICT the init flag isn't used, and the u8 wastes 6 bytes of space.
Two of these get plugged into struct time_stats, which means we waste 12
bytes of space on a ~464 byte structure. Removing quantiles support
from that shrinks the struct down to 208 bytes.

Any chance we could pass in the weights as a function parameter so that
we could shrink this to 16 bytes? If we do that and rearrange
time_stats, the whole thing goes down to 192 bytes, which means I can
spray in twice as many timestats.

I also noticed that struct time_stat_buffer is:

struct time_stat_buffer {
unsigned int nr; /* 0 4 */

/* XXX 4 bytes hole, try to pack */

struct time_stat_buffer_entry {
/* typedef u64 -> __u64 */ long long unsigned int start; /* 8 8 */
/* typedef u64 -> __u64 */ long long unsigned int end; /* 16 8 */
} entries[32]; /* 8 512 */

/* size: 520, cachelines: 9, members: 2 */
/* sum members: 516, holes: 1, sum holes: 4 */
/* last cacheline: 8 bytes */
};

I wonder, if entries[] shrank to 31 entries, this would align to 512b;
would that make for more efficient allocations? I tried to follow
alloc_percpu_gfp and got caught in a twisty mess of macros.

--D

FWIW the full time_stats struct ended up like this after I turned off
quantiles and did all the lazy rearranging I could do without removing
init or weight:

struct time_stats {
/* typedef spinlock_t */ struct spinlock {
union {
struct raw_spinlock {
/* typedef arch_spinlock_t */ struct qspinlock {
union {
/* typedef atomic_t */ struct {
int counter; /* 0 4 */
} val; /* 0 4 */
struct {
/* typedef u8 -> __u8 */ unsigned char locked; /* 0 1 */
/* typedef u8 -> __u8 */ unsigned char pending; /* 1 1 */
}; /* 0 2 */
struct {
/* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0 2 */
/* typedef u16 -> __u16 */ short unsigned int tail; /* 2 2 */
}; /* 0 4 */
}; /* 0 4 */
} raw_lock; /* 0 4 */
unsigned int magic; /* 4 4 */
unsigned int owner_cpu; /* 8 4 */

/* XXX 4 bytes hole, try to pack */

void * owner; /* 16 8 */
}rlock; /* 0 24 */
}; /* 0 24 */
} lock; /* 0 24 */
/* typedef u64 -> __u64 */ long long unsigned int min_duration; /* 24 8 */
/* typedef u64 -> __u64 */ long long unsigned int max_duration; /* 32 8 */
/* typedef u64 -> __u64 */ long long unsigned int total_duration; /* 40 8 */
/* typedef u64 -> __u64 */ long long unsigned int max_freq; /* 48 8 */
/* typedef u64 -> __u64 */ long long unsigned int min_freq; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
/* typedef u64 -> __u64 */ long long unsigned int last_event; /* 64 8 */
/* typedef u64 -> __u64 */ long long unsigned int last_event_start; /* 72 8 */
struct mean_and_variance {
/* typedef s64 -> __s64 */ long long int n; /* 80 8 */
/* typedef s64 -> __s64 */ long long int sum; /* 88 8 */
/* typedef u128_u */ struct {
__int128 unsigned v; /* 96 16 */
} sum_squares __attribute__((__aligned__(16))) __attribute__((__aligned__(16))); /* 96 16 */
} __attribute__((__aligned__(16)))duration_stats __attribute__((__aligned__(16))); /* 80 32 */
struct mean_and_variance {
/* typedef s64 -> __s64 */ long long int n; /* 112 8 */
/* typedef s64 -> __s64 */ long long int sum; /* 120 8 */
/* --- cacheline 2 boundary (128 bytes) --- */
/* typedef u128_u */ struct {
__int128 unsigned v; /* 128 16 */
} sum_squares __attribute__((__aligned__(16))) __attribute__((__aligned__(16))); /* 128 16 */
} __attribute__((__aligned__(16)))freq_stats __attribute__((__aligned__(16))); /* 112 32 */
struct mean_and_variance_weighted {
/* typedef bool */ _Bool init; /* 144 1 */
/* typedef u8 -> __u8 */ unsigned char weight; /* 145 1 */

/* XXX 6 bytes hole, try to pack */

/* typedef s64 -> __s64 */ long long int mean; /* 152 8 */
/* typedef u64 -> __u64 */ long long unsigned int variance; /* 160 8 */
}duration_stats_weighted; /* 144 24 */
struct mean_and_variance_weighted {
/* typedef bool */ _Bool init; /* 168 1 */
/* typedef u8 -> __u8 */ unsigned char weight; /* 169 1 */

/* XXX 6 bytes hole, try to pack */

/* typedef s64 -> __s64 */ long long int mean; /* 176 8 */
/* typedef u64 -> __u64 */ long long unsigned int variance; /* 184 8 */
}freq_stats_weighted; /* 168 24 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct time_stat_buffer * buffer; /* 192 8 */

/* size: 208, cachelines: 4, members: 13 */
/* padding: 8 */
/* forced alignments: 2 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(16)));


> Signed-off-by: Kent Overstreet <[email protected]>
> Cc: Daniel Hill <[email protected]>
> Cc: Darrick J. Wong <[email protected]>
> ---
> MAINTAINERS | 9 +++++++++
> fs/bcachefs/Kconfig | 10 +---------
> fs/bcachefs/Makefile | 3 ---
> fs/bcachefs/util.c | 2 +-
> fs/bcachefs/util.h | 3 +--
> {fs/bcachefs => include/linux}/mean_and_variance.h | 0
> lib/Kconfig.debug | 9 +++++++++
> lib/math/Kconfig | 3 +++
> lib/math/Makefile | 2 ++
> {fs/bcachefs => lib/math}/mean_and_variance.c | 3 +--
> {fs/bcachefs => lib/math}/mean_and_variance_test.c | 3 +--
> 11 files changed, 28 insertions(+), 19 deletions(-)
> rename {fs/bcachefs => include/linux}/mean_and_variance.h (100%)
> rename {fs/bcachefs => lib/math}/mean_and_variance.c (99%)
> rename {fs/bcachefs => lib/math}/mean_and_variance_test.c (99%)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 8d1052fa6a69..de635cfd354d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13379,6 +13379,15 @@ S: Maintained
> F: drivers/net/mdio/mdio-regmap.c
> F: include/linux/mdio/mdio-regmap.h
>
> +MEAN AND VARIANCE LIBRARY
> +M: Daniel B. Hill <[email protected]>
> +M: Kent Overstreet <[email protected]>
> +S: Maintained
> +T: git https://github.com/YellowOnion/linux/
> +F: include/linux/mean_and_variance.h
> +F: lib/math/mean_and_variance.c
> +F: lib/math/mean_and_variance_test.c
> +
> MEASUREMENT COMPUTING CIO-DAC IIO DRIVER
> M: William Breathitt Gray <[email protected]>
> L: [email protected]
> diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
> index 5cdfef3b551a..72d1179262b3 100644
> --- a/fs/bcachefs/Kconfig
> +++ b/fs/bcachefs/Kconfig
> @@ -24,6 +24,7 @@ config BCACHEFS_FS
> select XXHASH
> select SRCU
> select SYMBOLIC_ERRNAME
> + select MEAN_AND_VARIANCE
> help
> The bcachefs filesystem - a modern, copy on write filesystem, with
> support for multiple devices, compression, checksumming, etc.
> @@ -86,12 +87,3 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
> Instead of immediately sleeping when attempting to take a six lock that
> is held by another thread, spin for a short while, as long as the
> thread owning the lock is running.
> -
> -config MEAN_AND_VARIANCE_UNIT_TEST
> - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
> - depends on KUNIT
> - depends on BCACHEFS_FS
> - default KUNIT_ALL_TESTS
> - help
> - This option enables the kunit tests for mean_and_variance module.
> - If unsure, say N.
> diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
> index 1a05cecda7cc..b11ba74b8ad4 100644
> --- a/fs/bcachefs/Makefile
> +++ b/fs/bcachefs/Makefile
> @@ -57,7 +57,6 @@ bcachefs-y := \
> keylist.o \
> logged_ops.o \
> lru.o \
> - mean_and_variance.o \
> migrate.o \
> move.o \
> movinggc.o \
> @@ -88,5 +87,3 @@ bcachefs-y := \
> util.o \
> varint.o \
> xattr.o
> -
> -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
> diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> index 56b815fd9fc6..d7ea95abb9df 100644
> --- a/fs/bcachefs/util.c
> +++ b/fs/bcachefs/util.c
> @@ -22,9 +22,9 @@
> #include <linux/string.h>
> #include <linux/types.h>
> #include <linux/sched/clock.h>
> +#include <linux/mean_and_variance.h>
>
> #include "eytzinger.h"
> -#include "mean_and_variance.h"
> #include "util.h"
>
> static const char si_units[] = "?kMGTPEZY";
> diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> index b414736d59a5..0059481995ef 100644
> --- a/fs/bcachefs/util.h
> +++ b/fs/bcachefs/util.h
> @@ -17,8 +17,7 @@
> #include <linux/slab.h>
> #include <linux/vmalloc.h>
> #include <linux/workqueue.h>
> -
> -#include "mean_and_variance.h"
> +#include <linux/mean_and_variance.h>
>
> #include "darray.h"
>
> diff --git a/fs/bcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h
> similarity index 100%
> rename from fs/bcachefs/mean_and_variance.h
> rename to include/linux/mean_and_variance.h
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 975a07f9f1cc..817ddfe132cd 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2191,6 +2191,15 @@ config CPUMASK_KUNIT_TEST
>
> If unsure, say N.
>
> +config MEAN_AND_VARIANCE_UNIT_TEST
> + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
> + depends on KUNIT
> + select MEAN_AND_VARIANCE
> + default KUNIT_ALL_TESTS
> + help
> + This option enables the kunit tests for mean_and_variance module.
> + If unsure, say N.
> +
> config TEST_LIST_SORT
> tristate "Linked list sorting test" if !KUNIT_ALL_TESTS
> depends on KUNIT
> diff --git a/lib/math/Kconfig b/lib/math/Kconfig
> index 0634b428d0cb..7530ae9a3584 100644
> --- a/lib/math/Kconfig
> +++ b/lib/math/Kconfig
> @@ -15,3 +15,6 @@ config PRIME_NUMBERS
>
> config RATIONAL
> tristate
> +
> +config MEAN_AND_VARIANCE
> + tristate
> diff --git a/lib/math/Makefile b/lib/math/Makefile
> index 91fcdb0c9efe..8cdfa13a67ce 100644
> --- a/lib/math/Makefile
> +++ b/lib/math/Makefile
> @@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_log.o int_pow.o int_sqrt.o reciprocal_div.o
> obj-$(CONFIG_CORDIC) += cordic.o
> obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
> obj-$(CONFIG_RATIONAL) += rational.o
> +obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o
>
> obj-$(CONFIG_TEST_DIV64) += test_div64.o
> obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o
> +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
> diff --git a/fs/bcachefs/mean_and_variance.c b/lib/math/mean_and_variance.c
> similarity index 99%
> rename from fs/bcachefs/mean_and_variance.c
> rename to lib/math/mean_and_variance.c
> index bf0ef668fd38..ba90293204ba 100644
> --- a/fs/bcachefs/mean_and_variance.c
> +++ b/lib/math/mean_and_variance.c
> @@ -40,10 +40,9 @@
> #include <linux/limits.h>
> #include <linux/math.h>
> #include <linux/math64.h>
> +#include <linux/mean_and_variance.h>
> #include <linux/module.h>
>
> -#include "mean_and_variance.h"
> -
> u128_u u128_div(u128_u n, u64 d)
> {
> u128_u r;
> diff --git a/fs/bcachefs/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c
> similarity index 99%
> rename from fs/bcachefs/mean_and_variance_test.c
> rename to lib/math/mean_and_variance_test.c
> index 019583c3ca0e..f45591a169d8 100644
> --- a/fs/bcachefs/mean_and_variance_test.c
> +++ b/lib/math/mean_and_variance_test.c
> @@ -1,7 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0
> #include <kunit/test.h>
> -
> -#include "mean_and_variance.h"
> +#include <linux/mean_and_variance.h>
>
> #define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
>
> --
> 2.43.0
>
>

2024-02-02 09:57:16

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 1/5] mean and variance: Promote to lib/math

On Thu, Feb 01, 2024 at 11:30:34PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 26, 2024 at 05:06:51PM -0500, Kent Overstreet wrote:
> > > Small statistics library, for taking in a series of values and computing
> > mean, weighted mean, standard deviation and weighted deviation.
> >
> > The main use case is for statistics on latency measurements.
>
> Oh, heh, I didn't realize this was a patch and not a cover letter.
>
> A few things I noticed while adding timestats to xfs -- pahole says this
> about the weighted mean and variance object:
>
> struct mean_and_variance_weighted {
> /* typedef bool */ _Bool init; /* 0 1 */
> /* typedef u8 -> __u8 */ unsigned char weight; /* 1 1 */
>
> /* XXX 6 bytes hole, try to pack */
>
> /* typedef s64 -> __s64 */ long long int mean; /* 8 8 */
> /* typedef u64 -> __u64 */ long long unsigned int variance; /* 16 8 */
>
> /* size: 24, cachelines: 1, members: 4 */
> /* sum members: 18, holes: 1, sum holes: 6 */
> /* last cacheline: 24 bytes */
> };
>
> AFAICT the init flag isn't used, and the u8 wastes 6 bytes of space.
> Two of these get plugged into struct time_stats, which means we waste 12
> bytes of space on a ~464 byte structure. Removing quantiles support
> from that shrinks the struct down to 208 bytes.
>
> Any chance we could pass in the weights as a function parameter so that
> we could shrink this to 16 bytes? If we do that and rearrange
> time_stats, the whole thing goes down to 192 bytes, which means I can
> spray in twice as many timestats.

I think that works - 3 cachelines is a nice neat size.

>
> I also noticed that struct time_stat_buffer is:
>
> struct time_stat_buffer {
> unsigned int nr; /* 0 4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> struct time_stat_buffer_entry {
> /* typedef u64 -> __u64 */ long long unsigned int start; /* 8 8 */
> /* typedef u64 -> __u64 */ long long unsigned int end; /* 16 8 */
> } entries[32]; /* 8 512 */
>
> /* size: 520, cachelines: 9, members: 2 */
> /* sum members: 516, holes: 1, sum holes: 4 */
> /* last cacheline: 8 bytes */
> };
>
> I wonder, if entries[] shrank to 31 entries, this would align to 512b;
> would that make for more efficient allocations? I tried to follow
> alloc_percpu_gfp and got caught in a twisty mess of macros.

yes, yes it would

the percpu allocator is really simplistic, it's just finding the first
free region >= the requested size.

Also, there's some more savings that could be had if someone wanted to
derive the weighted version of median absolute deviation as Linus
suggested a while back, but that one was above my statistical pay
grade...

>
> --D
>
> FWIW the full time_stats struct ended up like this after I turned off
> quantiles and did all the lazy rearranging I could do without removing
> init or weight:
>
> struct time_stats {
> /* typedef spinlock_t */ struct spinlock {
> union {
> struct raw_spinlock {
> /* typedef arch_spinlock_t */ struct qspinlock {
> union {
> /* typedef atomic_t */ struct {
> int counter; /* 0 4 */
> } val; /* 0 4 */
> struct {
> /* typedef u8 -> __u8 */ unsigned char locked; /* 0 1 */
> /* typedef u8 -> __u8 */ unsigned char pending; /* 1 1 */
> }; /* 0 2 */
> struct {
> /* typedef u16 -> __u16 */ short unsigned int locked_pending; /* 0 2 */
> /* typedef u16 -> __u16 */ short unsigned int tail; /* 2 2 */
> }; /* 0 4 */
> }; /* 0 4 */
> } raw_lock; /* 0 4 */
> unsigned int magic; /* 4 4 */
> unsigned int owner_cpu; /* 8 4 */
>
> /* XXX 4 bytes hole, try to pack */
>
> void * owner; /* 16 8 */
> }rlock; /* 0 24 */
> }; /* 0 24 */
> } lock; /* 0 24 */
> /* typedef u64 -> __u64 */ long long unsigned int min_duration; /* 24 8 */
> /* typedef u64 -> __u64 */ long long unsigned int max_duration; /* 32 8 */
> /* typedef u64 -> __u64 */ long long unsigned int total_duration; /* 40 8 */
> /* typedef u64 -> __u64 */ long long unsigned int max_freq; /* 48 8 */
> /* typedef u64 -> __u64 */ long long unsigned int min_freq; /* 56 8 */
> /* --- cacheline 1 boundary (64 bytes) --- */
> /* typedef u64 -> __u64 */ long long unsigned int last_event; /* 64 8 */
> /* typedef u64 -> __u64 */ long long unsigned int last_event_start; /* 72 8 */
> struct mean_and_variance {
> /* typedef s64 -> __s64 */ long long int n; /* 80 8 */
> /* typedef s64 -> __s64 */ long long int sum; /* 88 8 */
> /* typedef u128_u */ struct {
> __int128 unsigned v; /* 96 16 */
> } sum_squares __attribute__((__aligned__(16))) __attribute__((__aligned__(16))); /* 96 16 */
> } __attribute__((__aligned__(16)))duration_stats __attribute__((__aligned__(16))); /* 80 32 */
> struct mean_and_variance {
> /* typedef s64 -> __s64 */ long long int n; /* 112 8 */
> /* typedef s64 -> __s64 */ long long int sum; /* 120 8 */
> /* --- cacheline 2 boundary (128 bytes) --- */
> /* typedef u128_u */ struct {
> __int128 unsigned v; /* 128 16 */
> } sum_squares __attribute__((__aligned__(16))) __attribute__((__aligned__(16))); /* 128 16 */
> } __attribute__((__aligned__(16)))freq_stats __attribute__((__aligned__(16))); /* 112 32 */
> struct mean_and_variance_weighted {
> /* typedef bool */ _Bool init; /* 144 1 */
> /* typedef u8 -> __u8 */ unsigned char weight; /* 145 1 */
>
> /* XXX 6 bytes hole, try to pack */
>
> /* typedef s64 -> __s64 */ long long int mean; /* 152 8 */
> /* typedef u64 -> __u64 */ long long unsigned int variance; /* 160 8 */
> }duration_stats_weighted; /* 144 24 */
> struct mean_and_variance_weighted {
> /* typedef bool */ _Bool init; /* 168 1 */
> /* typedef u8 -> __u8 */ unsigned char weight; /* 169 1 */
>
> /* XXX 6 bytes hole, try to pack */
>
> /* typedef s64 -> __s64 */ long long int mean; /* 176 8 */
> /* typedef u64 -> __u64 */ long long unsigned int variance; /* 184 8 */
> }freq_stats_weighted; /* 168 24 */
> /* --- cacheline 3 boundary (192 bytes) --- */
> struct time_stat_buffer * buffer; /* 192 8 */
>
> /* size: 208, cachelines: 4, members: 13 */
> /* padding: 8 */
> /* forced alignments: 2 */
> /* last cacheline: 16 bytes */
> } __attribute__((__aligned__(16)));
>
>
> > Signed-off-by: Kent Overstreet <[email protected]>
> > Cc: Daniel Hill <[email protected]>
> > Cc: Darrick J. Wong <[email protected]>
> > ---
> > MAINTAINERS | 9 +++++++++
> > fs/bcachefs/Kconfig | 10 +---------
> > fs/bcachefs/Makefile | 3 ---
> > fs/bcachefs/util.c | 2 +-
> > fs/bcachefs/util.h | 3 +--
> > {fs/bcachefs => include/linux}/mean_and_variance.h | 0
> > lib/Kconfig.debug | 9 +++++++++
> > lib/math/Kconfig | 3 +++
> > lib/math/Makefile | 2 ++
> > {fs/bcachefs => lib/math}/mean_and_variance.c | 3 +--
> > {fs/bcachefs => lib/math}/mean_and_variance_test.c | 3 +--
> > 11 files changed, 28 insertions(+), 19 deletions(-)
> > rename {fs/bcachefs => include/linux}/mean_and_variance.h (100%)
> > rename {fs/bcachefs => lib/math}/mean_and_variance.c (99%)
> > rename {fs/bcachefs => lib/math}/mean_and_variance_test.c (99%)
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 8d1052fa6a69..de635cfd354d 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -13379,6 +13379,15 @@ S: Maintained
> > F: drivers/net/mdio/mdio-regmap.c
> > F: include/linux/mdio/mdio-regmap.h
> >
> > +MEAN AND VARIANCE LIBRARY
> > +M: Daniel B. Hill <[email protected]>
> > +M: Kent Overstreet <[email protected]>
> > +S: Maintained
> > +T: git https://github.com/YellowOnion/linux/
> > +F: include/linux/mean_and_variance.h
> > +F: lib/math/mean_and_variance.c
> > +F: lib/math/mean_and_variance_test.c
> > +
> > MEASUREMENT COMPUTING CIO-DAC IIO DRIVER
> > M: William Breathitt Gray <[email protected]>
> > L: [email protected]
> > diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
> > index 5cdfef3b551a..72d1179262b3 100644
> > --- a/fs/bcachefs/Kconfig
> > +++ b/fs/bcachefs/Kconfig
> > @@ -24,6 +24,7 @@ config BCACHEFS_FS
> > select XXHASH
> > select SRCU
> > select SYMBOLIC_ERRNAME
> > + select MEAN_AND_VARIANCE
> > help
> > The bcachefs filesystem - a modern, copy on write filesystem, with
> > support for multiple devices, compression, checksumming, etc.
> > @@ -86,12 +87,3 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
> > Instead of immediately sleeping when attempting to take a six lock that
> > is held by another thread, spin for a short while, as long as the
> > thread owning the lock is running.
> > -
> > -config MEAN_AND_VARIANCE_UNIT_TEST
> > - tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
> > - depends on KUNIT
> > - depends on BCACHEFS_FS
> > - default KUNIT_ALL_TESTS
> > - help
> > - This option enables the kunit tests for mean_and_variance module.
> > - If unsure, say N.
> > diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
> > index 1a05cecda7cc..b11ba74b8ad4 100644
> > --- a/fs/bcachefs/Makefile
> > +++ b/fs/bcachefs/Makefile
> > @@ -57,7 +57,6 @@ bcachefs-y := \
> > keylist.o \
> > logged_ops.o \
> > lru.o \
> > - mean_and_variance.o \
> > migrate.o \
> > move.o \
> > movinggc.o \
> > @@ -88,5 +87,3 @@ bcachefs-y := \
> > util.o \
> > varint.o \
> > xattr.o
> > -
> > -obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
> > diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
> > index 56b815fd9fc6..d7ea95abb9df 100644
> > --- a/fs/bcachefs/util.c
> > +++ b/fs/bcachefs/util.c
> > @@ -22,9 +22,9 @@
> > #include <linux/string.h>
> > #include <linux/types.h>
> > #include <linux/sched/clock.h>
> > +#include <linux/mean_and_variance.h>
> >
> > #include "eytzinger.h"
> > -#include "mean_and_variance.h"
> > #include "util.h"
> >
> > static const char si_units[] = "?kMGTPEZY";
> > diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
> > index b414736d59a5..0059481995ef 100644
> > --- a/fs/bcachefs/util.h
> > +++ b/fs/bcachefs/util.h
> > @@ -17,8 +17,7 @@
> > #include <linux/slab.h>
> > #include <linux/vmalloc.h>
> > #include <linux/workqueue.h>
> > -
> > -#include "mean_and_variance.h"
> > +#include <linux/mean_and_variance.h>
> >
> > #include "darray.h"
> >
> > diff --git a/fs/bcachefs/mean_and_variance.h b/include/linux/mean_and_variance.h
> > similarity index 100%
> > rename from fs/bcachefs/mean_and_variance.h
> > rename to include/linux/mean_and_variance.h
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 975a07f9f1cc..817ddfe132cd 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -2191,6 +2191,15 @@ config CPUMASK_KUNIT_TEST
> >
> > If unsure, say N.
> >
> > +config MEAN_AND_VARIANCE_UNIT_TEST
> > + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
> > + depends on KUNIT
> > + select MEAN_AND_VARIANCE
> > + default KUNIT_ALL_TESTS
> > + help
> > + This option enables the kunit tests for mean_and_variance module.
> > + If unsure, say N.
> > +
> > config TEST_LIST_SORT
> > tristate "Linked list sorting test" if !KUNIT_ALL_TESTS
> > depends on KUNIT
> > diff --git a/lib/math/Kconfig b/lib/math/Kconfig
> > index 0634b428d0cb..7530ae9a3584 100644
> > --- a/lib/math/Kconfig
> > +++ b/lib/math/Kconfig
> > @@ -15,3 +15,6 @@ config PRIME_NUMBERS
> >
> > config RATIONAL
> > tristate
> > +
> > +config MEAN_AND_VARIANCE
> > + tristate
> > diff --git a/lib/math/Makefile b/lib/math/Makefile
> > index 91fcdb0c9efe..8cdfa13a67ce 100644
> > --- a/lib/math/Makefile
> > +++ b/lib/math/Makefile
> > @@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_log.o int_pow.o int_sqrt.o reciprocal_div.o
> > obj-$(CONFIG_CORDIC) += cordic.o
> > obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
> > obj-$(CONFIG_RATIONAL) += rational.o
> > +obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o
> >
> > obj-$(CONFIG_TEST_DIV64) += test_div64.o
> > obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o
> > +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
> > diff --git a/fs/bcachefs/mean_and_variance.c b/lib/math/mean_and_variance.c
> > similarity index 99%
> > rename from fs/bcachefs/mean_and_variance.c
> > rename to lib/math/mean_and_variance.c
> > index bf0ef668fd38..ba90293204ba 100644
> > --- a/fs/bcachefs/mean_and_variance.c
> > +++ b/lib/math/mean_and_variance.c
> > @@ -40,10 +40,9 @@
> > #include <linux/limits.h>
> > #include <linux/math.h>
> > #include <linux/math64.h>
> > +#include <linux/mean_and_variance.h>
> > #include <linux/module.h>
> >
> > -#include "mean_and_variance.h"
> > -
> > u128_u u128_div(u128_u n, u64 d)
> > {
> > u128_u r;
> > diff --git a/fs/bcachefs/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c
> > similarity index 99%
> > rename from fs/bcachefs/mean_and_variance_test.c
> > rename to lib/math/mean_and_variance_test.c
> > index 019583c3ca0e..f45591a169d8 100644
> > --- a/fs/bcachefs/mean_and_variance_test.c
> > +++ b/lib/math/mean_and_variance_test.c
> > @@ -1,7 +1,6 @@
> > // SPDX-License-Identifier: GPL-2.0
> > #include <kunit/test.h>
> > -
> > -#include "mean_and_variance.h"
> > +#include <linux/mean_and_variance.h>
> >
> > #define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX)
> >
> > --
> > 2.43.0
> >
> >