2022-08-29 17:36:57

by Kent Overstreet

[permalink] [raw]
Subject: Time stats library (from bcache/bcachefs) & generic latency instrumentation

Hey Coly, this is a fragment from a larger patch series - I wanted to send it
out to you now for review because it changes the bcache sysfs interface for
time_stats.

This series takes the time_stats code from bcachefs (which has some additional
improvements), moves it to lib/ converts bcache to use it (otherwise we get
clashes with the time_stats name) - and uses it for some really cool new latency
instrumentation.

These patches won't apply/build, full patch series for any interested is here:

https://evilpiepirate.org/git/bcachefs.git/log/?h=alloc_tags



2022-08-29 18:08:22

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 3/3] Code tagging based latency tracking

This adds the ability to easily instrument code for measuring latency.
To use, add the following to calls to your code, at the start and end of
the event you wish to measure:

code_tag_time_stats_start(start_time);
code_tag_time_stats_finish(start_time);

Stastistics will then show up in debugfs under
/sys/kernel/debug/time_stats, listed by file and line number.

Stastics measured include weighted averages of frequency, duration, max
duration, as well as quantiles.

This patch also instruments all calls to init_wait and finish_wait,
which includes all calls to wait_event. Example debugfs output:

fs/xfs/xfs_trans_ail.c:746 module:xfs func:xfs_ail_push_all_sync
count: 17
rate: 0/sec
frequency: 2 sec
avg duration: 10 us
max duration: 232 us
quantiles (ns): 128 128 128 128 128 128 128 128 128 128 128 128 128 128 128

lib/sbitmap.c:813 module:sbitmap func:sbitmap_finish_wait
count: 3
rate: 0/sec
frequency: 4 sec
avg duration: 4 sec
max duration: 4 sec
quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048

net/core/datagram.c:122 module:datagram func:__skb_wait_for_more_packets
count: 10
rate: 1/sec
frequency: 859 ms
avg duration: 472 ms
max duration: 30 sec
quantiles (ns): 0 12279 12279 15669 15669 15669 15669 17217 17217 17217 17217 17217 17217 17217 17217

Signed-off-by: Kent Overstreet <[email protected]>
---
include/asm-generic/codetag.lds.h | 3 +-
include/linux/codetag_time_stats.h | 54 +++++++++++
include/linux/wait.h | 22 ++++-
kernel/sched/wait.c | 6 +-
lib/Kconfig.debug | 8 ++
lib/Makefile | 1 +
lib/codetag_time_stats.c | 143 +++++++++++++++++++++++++++++
7 files changed, 232 insertions(+), 5 deletions(-)
create mode 100644 include/linux/codetag_time_stats.h
create mode 100644 lib/codetag_time_stats.c

diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
index 16fbf74edc..d799f4aced 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -10,6 +10,7 @@

#define CODETAG_SECTIONS() \
SECTION_WITH_BOUNDARIES(alloc_tags) \
- SECTION_WITH_BOUNDARIES(dynamic_fault_tags)
+ SECTION_WITH_BOUNDARIES(dynamic_fault_tags) \
+ SECTION_WITH_BOUNDARIES(time_stats_tags)

#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/linux/codetag_time_stats.h b/include/linux/codetag_time_stats.h
new file mode 100644
index 0000000000..7e44c7ee9e
--- /dev/null
+++ b/include/linux/codetag_time_stats.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CODETAG_TIMESTATS_H
+#define _LINUX_CODETAG_TIMESTATS_H
+
+/*
+ * Code tagging based latency tracking:
+ * (C) 2022 Kent Overstreet
+ *
+ * This allows you to easily instrument code to track latency, and have the
+ * results show up in debugfs. To use, add the following two calls to your code
+ * at the beginning and end of the event you wish to instrument:
+ *
+ * code_tag_time_stats_start(start_time);
+ * code_tag_time_stats_finish(start_time);
+ *
+ * Statistics will then show up in debugfs under /sys/kernel/debug/time_stats,
+ * listed by file and line number.
+ */
+
+#ifdef CONFIG_CODETAG_TIME_STATS
+
+#include <linux/codetag.h>
+#include <linux/time_stats.h>
+#include <linux/timekeeping.h>
+
+struct codetag_time_stats {
+ struct codetag tag;
+ struct time_stats stats;
+};
+
+#define codetag_time_stats_start(_start_time) u64 _start_time = ktime_get_ns()
+
+#define codetag_time_stats_finish(_start_time) \
+do { \
+ static struct codetag_time_stats \
+ __used \
+ __section("time_stats_tags") \
+ __aligned(8) s = { \
+ .tag = CODE_TAG_INIT, \
+ .stats.lock = __SPIN_LOCK_UNLOCKED(_lock) \
+ }; \
+ \
+ WARN_ONCE(!(_start_time), "codetag_time_stats_start() not called");\
+ time_stats_update(&s.stats, _start_time); \
+} while (0)
+
+#else
+
+#define codetag_time_stats_finish(_start_time) do {} while (0)
+#define codetag_time_stats_start(_start_time) do {} while (0)
+
+#endif /* CODETAG_CODETAG_TIME_STATS */
+
+#endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 91ced6a118..bab11b7ef1 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -4,6 +4,7 @@
/*
* Linux wait queue related types and methods
*/
+#include <linux/codetag_time_stats.h>
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>
@@ -32,6 +33,9 @@ struct wait_queue_entry {
void *private;
wait_queue_func_t func;
struct list_head entry;
+#ifdef CONFIG_CODETAG_TIME_STATS
+ u64 start_time;
+#endif
};

struct wait_queue_head {
@@ -79,10 +83,17 @@ extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *n
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

+#ifdef CONFIG_CODETAG_TIME_STATS
+#define WAIT_QUEUE_ENTRY_START_TIME_INITIALIZER .start_time = ktime_get_ns(),
+#else
+#define WAIT_QUEUE_ENTRY_START_TIME_INITIALIZER
+#endif
+
#define WAIT_FUNC_INITIALIZER(name, function) { \
.private = current, \
.func = function, \
.entry = LIST_HEAD_INIT((name).entry), \
+ WAIT_QUEUE_ENTRY_START_TIME_INITIALIZER \
}

#define DEFINE_WAIT_FUNC(name, function) \
@@ -98,6 +109,9 @@ __init_waitqueue_entry(struct wait_queue_entry *wq_entry, unsigned int flags,
wq_entry->private = private;
wq_entry->func = func;
INIT_LIST_HEAD(&wq_entry->entry);
+#ifdef CONFIG_CODETAG_TIME_STATS
+ wq_entry->start_time = ktime_get_ns();
+#endif
}

#define init_waitqueue_func_entry(_wq_entry, _func) \
@@ -1180,11 +1194,17 @@ do { \
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
-void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+void __finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

+#define finish_wait(_wq_head, _wq_entry) \
+do { \
+ codetag_time_stats_finish((_wq_entry)->start_time); \
+ __finish_wait(_wq_head, _wq_entry); \
+} while (0)
+
typedef int (*task_call_f)(struct task_struct *p, void *arg);
extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);

diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index b992234607..e88de3f0c3 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -367,7 +367,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
EXPORT_SYMBOL(do_wait_intr_irq);

/**
- * finish_wait - clean up after waiting in a queue
+ * __finish_wait - clean up after waiting in a queue
* @wq_head: waitqueue waited on
* @wq_entry: wait descriptor
*
@@ -375,7 +375,7 @@ EXPORT_SYMBOL(do_wait_intr_irq);
* the wait descriptor from the given waitqueue if still
* queued.
*/
-void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+void __finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;

@@ -399,7 +399,7 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
-EXPORT_SYMBOL(finish_wait);
+EXPORT_SYMBOL(__finish_wait);

int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index bfb49505c9..40584fe54a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1731,6 +1731,14 @@ config LATENCYTOP
config TIME_STATS
bool

+config CODETAG_TIME_STATS
+ bool "Code tagging based latency measuring"
+ depends on DEBUG_FS
+ select TIME_STATS
+ select CODE_TAGGING
+ help
+ Enabling this option makes latency statistics available in debugfs
+
source "kernel/trace/Kconfig"

config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Makefile b/lib/Makefile
index e09255c881..7888bc8baa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -233,6 +233,7 @@ obj-$(CONFIG_PAGE_ALLOC_TAGGING) += pgalloc_tag.o

obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o
obj-$(CONFIG_TIME_STATS) += time_stats.o
+obj-$(CONFIG_CODETAG_TIME_STATS) += codetag_time_stats.o

lib-$(CONFIG_GENERIC_BUG) += bug.o

diff --git a/lib/codetag_time_stats.c b/lib/codetag_time_stats.c
new file mode 100644
index 0000000000..67f248906d
--- /dev/null
+++ b/lib/codetag_time_stats.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/codetag_time_stats.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_buf.h>
+
+static struct codetag_type *cttype;
+
+struct user_buf {
+ char __user *buf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
+static int flush_ubuf(struct user_buf *dst, struct seq_buf *src)
+{
+ if (src->len) {
+ size_t bytes = min_t(size_t, src->len, dst->size);
+ int err = copy_to_user(dst->buf, src->buffer, bytes);
+
+ if (err)
+ return err;
+
+ dst->ret += bytes;
+ dst->buf += bytes;
+ dst->size -= bytes;
+ src->len -= bytes;
+ memmove(src->buffer, src->buffer + bytes, src->len);
+ }
+
+ return 0;
+}
+
+struct time_stats_iter {
+ struct codetag_iterator ct_iter;
+ struct seq_buf buf;
+ char rawbuf[4096];
+ bool first;
+};
+
+static int time_stats_open(struct inode *inode, struct file *file)
+{
+ struct time_stats_iter *iter;
+
+ pr_debug("called");
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ codetag_lock_module_list(cttype, true);
+ iter->ct_iter = codetag_get_ct_iter(cttype);
+ codetag_lock_module_list(cttype, false);
+
+ file->private_data = iter;
+ seq_buf_init(&iter->buf, iter->rawbuf, sizeof(iter->rawbuf));
+ iter->first = true;
+ return 0;
+}
+
+static int time_stats_release(struct inode *inode, struct file *file)
+{
+ struct time_stats_iter *i = file->private_data;
+
+ kfree(i);
+ return 0;
+}
+
+static ssize_t time_stats_read(struct file *file, char __user *ubuf,
+ size_t size, loff_t *ppos)
+{
+ struct time_stats_iter *iter = file->private_data;
+ struct user_buf buf = { .buf = ubuf, .size = size };
+ struct codetag_time_stats *s;
+ struct codetag *ct;
+ int err;
+
+ codetag_lock_module_list(iter->ct_iter.cttype, true);
+ while (1) {
+ err = flush_ubuf(&buf, &iter->buf);
+ if (err || !buf.size)
+ break;
+
+ ct = codetag_next_ct(&iter->ct_iter);
+ if (!ct)
+ break;
+
+ s = container_of(ct, struct codetag_time_stats, tag);
+ if (s->stats.count) {
+ if (!iter->first) {
+ seq_buf_putc(&iter->buf, '\n');
+ iter->first = true;
+ }
+
+ codetag_to_text(&iter->buf, &s->tag);
+ seq_buf_putc(&iter->buf, '\n');
+ time_stats_to_text(&iter->buf, &s->stats);
+ }
+ }
+ codetag_lock_module_list(iter->ct_iter.cttype, false);
+
+ return err ?: buf.ret;
+}
+
+static const struct file_operations time_stats_ops = {
+ .owner = THIS_MODULE,
+ .open = time_stats_open,
+ .release = time_stats_release,
+ .read = time_stats_read,
+};
+
+static void time_stats_module_unload(struct codetag_type *cttype, struct codetag_module *mod)
+{
+ struct codetag_time_stats *i, *start = (void *) mod->range.start;
+ struct codetag_time_stats *end = (void *) mod->range.stop;
+
+ for (i = start; i != end; i++)
+ time_stats_exit(&i->stats);
+}
+
+static int __init codetag_time_stats_init(void)
+{
+ const struct codetag_type_desc desc = {
+ .section = "time_stats_tags",
+ .tag_size = sizeof(struct codetag_time_stats),
+ .module_unload = time_stats_module_unload,
+ };
+ struct dentry *debugfs_file;
+
+ cttype = codetag_register_type(&desc);
+ if (IS_ERR_OR_NULL(cttype))
+ return PTR_ERR(cttype);
+
+ debugfs_file = debugfs_create_file("time_stats", 0666, NULL, NULL, &time_stats_ops);
+ if (IS_ERR(debugfs_file))
+ return PTR_ERR(debugfs_file);
+
+ return 0;
+}
+module_init(codetag_time_stats_init);
--
2.36.1

2022-08-29 18:08:22

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 2/3] bcache: Convert to lib/time_stats

This patch converts bcache to the new generic time_stats code
lib/time_stats.c. The new code is from bcachefs, and has some changes
from the version in bcache:

- we now use ktime_get_ns(), not local_clock(). When the code was
originally written multi processor systems that lacked synchronized
TSCs were still common, and so local_clock() was much cheaper than
sched_clock() (though not necessarily fully accurate, due to TSC
drift). ktime_get_ns() should be cheap enough on all common hardware
now, and more standard/correct.

- time_stats are now exported in a single file in sysfs, which means we
can improve the statistics we keep track of without changing all
users. This also means we don't have to manually specify which units
(ms, us, ns) a given time_stats should be printed in; that's handled
dynamically.

- There's a lazily-allocated percpu buffer, which now needs to be freed
with time_stats_exit().

Signed-off-by: Kent Overstreet <[email protected]>
Cc: Coly Li <[email protected]>
---
drivers/md/bcache/Kconfig | 1 +
drivers/md/bcache/bcache.h | 1 +
drivers/md/bcache/bset.c | 8 +++---
drivers/md/bcache/bset.h | 1 +
drivers/md/bcache/btree.c | 12 ++++----
drivers/md/bcache/super.c | 3 ++
drivers/md/bcache/sysfs.c | 43 ++++++++++++++++++++--------
drivers/md/bcache/util.c | 30 --------------------
drivers/md/bcache/util.h | 57 --------------------------------------
9 files changed, 47 insertions(+), 109 deletions(-)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e9..8d165052e5 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
tristate "Block device as cache"
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
+ select TIME_STATS
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2acda9cea0..5100010a38 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -185,6 +185,7 @@
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/refcount.h>
+#include <linux/time_stats.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 94d38e8a59..727e9b7aea 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1251,7 +1251,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
order = state->page_order;
}

- start_time = local_clock();
+ start_time = ktime_get_ns();

btree_mergesort(b, out, iter, fixup, false);
b->nsets = start;
@@ -1286,7 +1286,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
bch_bset_build_written_tree(b);

if (!start)
- bch_time_stats_update(&state->time, start_time);
+ time_stats_update(&state->time, start_time);
}

void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
@@ -1322,14 +1322,14 @@ void bch_btree_sort_and_fix_extents(struct btree_keys *b,
void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
struct bset_sort_state *state)
{
- uint64_t start_time = local_clock();
+ uint64_t start_time = ktime_get_ns();
struct btree_iter iter;

bch_btree_iter_init(b, &iter, NULL);

btree_mergesort(b, new->set->data, &iter, false, true);

- bch_time_stats_update(&state->time, start_time);
+ time_stats_update(&state->time, start_time);

new->set->size = 0; // XXX: why?
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index d795c84246..13e524ad77 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -3,6 +3,7 @@
#define _BCACHE_BSET_H

#include <linux/kernel.h>
+#include <linux/time_stats.h>
#include <linux/types.h>

#include "bcache_ondisk.h"
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 147c493a98..abf543bc75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -242,7 +242,7 @@ static void btree_node_read_endio(struct bio *bio)

static void bch_btree_node_read(struct btree *b)
{
- uint64_t start_time = local_clock();
+ uint64_t start_time = ktime_get_ns();
struct closure cl;
struct bio *bio;

@@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
goto err;

bch_btree_node_read_done(b);
- bch_time_stats_update(&b->c->btree_read_time, start_time);
+ time_stats_update(&b->c->btree_read_time, start_time);

return;
err:
@@ -1789,7 +1789,7 @@ static void bch_btree_gc(struct cache_set *c)
struct gc_stat stats;
struct closure writes;
struct btree_op op;
- uint64_t start_time = local_clock();
+ uint64_t start_time = ktime_get_ns();

trace_bcache_gc_start(c);

@@ -1815,7 +1815,7 @@ static void bch_btree_gc(struct cache_set *c)
bch_btree_gc_finish(c);
wake_up_allocators(c);

- bch_time_stats_update(&c->btree_gc_time, start_time);
+ time_stats_update(&c->btree_gc_time, start_time);

stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
@@ -2191,7 +2191,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
{
bool split;
struct btree *n1, *n2 = NULL, *n3 = NULL;
- uint64_t start_time = local_clock();
+ uint64_t start_time = ktime_get_ns();
struct closure cl;
struct keylist parent_keys;

@@ -2297,7 +2297,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
btree_node_free(b);
rw_unlock(true, n1);

- bch_time_stats_update(&b->c->btree_split_time, start_time);
+ time_stats_update(&b->c->btree_split_time, start_time);

return 0;
err_free2:
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ba3909bb6b..26c8fa93b5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1691,6 +1691,9 @@ static void cache_set_free(struct closure *cl)
kobject_put(&ca->kobj);
}

+ time_stats_exit(&c->btree_gc_time);
+ time_stats_exit(&c->btree_split_time);
+ time_stats_exit(&c->sort.time);

if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6f6770592..01eec5877c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,6 +16,7 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/sched/clock.h>
+#include <linux/seq_buf.h>

extern bool bcache_is_reboot;

@@ -79,10 +80,10 @@ read_attribute(active_journal_entries);
read_attribute(backing_dev_name);
read_attribute(backing_dev_uuid);

-sysfs_time_stats_attribute(btree_gc, sec, ms);
-sysfs_time_stats_attribute(btree_split, sec, us);
-sysfs_time_stats_attribute(btree_sort, ms, us);
-sysfs_time_stats_attribute(btree_read, ms, us);
+read_attribute(btree_gc_time);
+read_attribute(btree_split_time);
+read_attribute(btree_sort_time);
+read_attribute(btree_read_time);

read_attribute(btree_nodes);
read_attribute(btree_used_percent);
@@ -731,6 +732,9 @@ static unsigned int bch_average_key_size(struct cache_set *c)
SHOW(__bch_cache_set)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, PAGE_SIZE);

sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb));
sysfs_print(journal_delay_ms, c->journal_delay_ms);
@@ -743,10 +747,25 @@ SHOW(__bch_cache_set)
sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);

- sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
- sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
- sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
- sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
+ if (attr == &sysfs_btree_gc_time) {
+ time_stats_to_text(&s, &c->btree_gc_time);
+ return s.len;
+ }
+
+ if (attr == &sysfs_btree_split_time) {
+ time_stats_to_text(&s, &c->btree_split_time);
+ return s.len;
+ }
+
+ if (attr == &sysfs_btree_sort_time) {
+ time_stats_to_text(&s, &c->sort.time);
+ return s.len;
+ }
+
+ if (attr == &sysfs_btree_read_time) {
+ time_stats_to_text(&s, &c->btree_read_time);
+ return s.len;
+ }

sysfs_print(btree_used_percent, bch_btree_used(c));
sysfs_print(btree_nodes, c->gc_stats.nodes);
@@ -988,10 +1007,10 @@ KTYPE(bch_cache_set);
static struct attribute *bch_cache_set_internal_attrs[] = {
&sysfs_active_journal_entries,

- sysfs_time_stats_attribute_list(btree_gc, sec, ms)
- sysfs_time_stats_attribute_list(btree_split, sec, us)
- sysfs_time_stats_attribute_list(btree_sort, ms, us)
- sysfs_time_stats_attribute_list(btree_read, ms, us)
+ &sysfs_btree_gc_time,
+ &sysfs_btree_split_time,
+ &sysfs_btree_sort_time,
+ &sysfs_btree_read_time,

&sysfs_btree_nodes,
&sysfs_btree_used_percent,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index ae380bc399..95282bf0f9 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid)
return i;
}

-void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
-{
- uint64_t now, duration, last;
-
- spin_lock(&stats->lock);
-
- now = local_clock();
- duration = time_after64(now, start_time)
- ? now - start_time : 0;
- last = time_after64(now, stats->last)
- ? now - stats->last : 0;
-
- stats->max_duration = max(stats->max_duration, duration);
-
- if (stats->last) {
- ewma_add(stats->average_duration, duration, 8, 8);
-
- if (stats->average_frequency)
- ewma_add(stats->average_frequency, last, 8, 8);
- else
- stats->average_frequency = last << 8;
- } else {
- stats->average_duration = duration << 8;
- }
-
- stats->last = now ?: 1;
-
- spin_unlock(&stats->lock);
-}
-
/**
* bch_next_delay() - update ratelimiting statistics and calculate next delay
* @d: the struct bch_ratelimit to update
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 6f3cb7c921..1e1bdbae95 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -345,68 +345,11 @@ ssize_t bch_hprint(char *buf, int64_t v);
bool bch_is_zero(const char *p, size_t n);
int bch_parse_uuid(const char *s, char *uuid);

-struct time_stats {
- spinlock_t lock;
- /*
- * all fields are in nanoseconds, averages are ewmas stored left shifted
- * by 8
- */
- uint64_t max_duration;
- uint64_t average_duration;
- uint64_t average_frequency;
- uint64_t last;
-};
-
-void bch_time_stats_update(struct time_stats *stats, uint64_t time);
-
static inline unsigned int local_clock_us(void)
{
return local_clock() >> 10;
}

-#define NSEC_PER_ns 1L
-#define NSEC_PER_us NSEC_PER_USEC
-#define NSEC_PER_ms NSEC_PER_MSEC
-#define NSEC_PER_sec NSEC_PER_SEC
-
-#define __print_time_stat(stats, name, stat, units) \
- sysfs_print(name ## _ ## stat ## _ ## units, \
- div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
-
-#define sysfs_print_time_stats(stats, name, \
- frequency_units, \
- duration_units) \
-do { \
- __print_time_stat(stats, name, \
- average_frequency, frequency_units); \
- __print_time_stat(stats, name, \
- average_duration, duration_units); \
- sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
- div_u64((stats)->max_duration, \
- NSEC_PER_ ## duration_units)); \
- \
- sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
- ? div_s64(local_clock() - (stats)->last, \
- NSEC_PER_ ## frequency_units) \
- : -1LL); \
-} while (0)
-
-#define sysfs_time_stats_attribute(name, \
- frequency_units, \
- duration_units) \
-read_attribute(name ## _average_frequency_ ## frequency_units); \
-read_attribute(name ## _average_duration_ ## duration_units); \
-read_attribute(name ## _max_duration_ ## duration_units); \
-read_attribute(name ## _last_ ## frequency_units)
-
-#define sysfs_time_stats_attribute_list(name, \
- frequency_units, \
- duration_units) \
-&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
-&sysfs_ ## name ## _average_duration_ ## duration_units, \
-&sysfs_ ## name ## _max_duration_ ## duration_units, \
-&sysfs_ ## name ## _last_ ## frequency_units,
-
#define ewma_add(ewma, val, weight, factor) \
({ \
(ewma) *= (weight) - 1; \
--
2.36.1

2022-08-29 18:24:24

by Kent Overstreet

[permalink] [raw]
Subject: [PATCH 1/3] lib/time_stats: New library for statistics on events

This adds a small new library for tracking statistics on events that
have a duration, i.e. a start and end time.

- number of events
- rate/frequency
- average duration
- max duration
- duration quantiles

This code comes from bcachefs, and originally bcache: the next patch
will be converting bcache to use this version, and a subsequent patch
will be using code_tagging to instrument all wait_event() calls in the
kernel.

Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/time_stats.h | 44 +++++++
lib/Kconfig.debug | 3 +
lib/Makefile | 1 +
lib/time_stats.c | 236 +++++++++++++++++++++++++++++++++++++
4 files changed, 284 insertions(+)
create mode 100644 include/linux/time_stats.h
create mode 100644 lib/time_stats.c

diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
new file mode 100644
index 0000000000..7ae929e6f8
--- /dev/null
+++ b/include/linux/time_stats.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_TIMESTATS_H
+#define _LINUX_TIMESTATS_H
+
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+
+#define NR_QUANTILES 15
+
+struct quantiles {
+ struct quantile_entry {
+ u64 m;
+ u64 step;
+ } entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+ unsigned int nr;
+ struct time_stat_buffer_entry {
+ u64 start;
+ u64 end;
+ } entries[32];
+};
+
+struct time_stats {
+ spinlock_t lock;
+ u64 count;
+ /* all fields are in nanoseconds */
+ u64 average_duration;
+ u64 average_frequency;
+ u64 max_duration;
+ u64 last_event;
+ struct quantiles quantiles;
+
+ struct time_stat_buffer __percpu *buffer;
+};
+
+struct seq_buf;
+void time_stats_update(struct time_stats *stats, u64 start);
+void time_stats_to_text(struct seq_buf *out, struct time_stats *stats);
+void time_stats_exit(struct time_stats *stats);
+
+#endif /* _LINUX_TIMESTATS_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index bbe3ef939c..bfb49505c9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1728,6 +1728,9 @@ config LATENCYTOP
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.

+config TIME_STATS
+ bool
+
source "kernel/trace/Kconfig"

config PROVIDE_OHCI1394_DMA_INIT
diff --git a/lib/Makefile b/lib/Makefile
index c69430213e..e09255c881 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -232,6 +232,7 @@ obj-$(CONFIG_ALLOC_TAGGING) += alloc_tag.o
obj-$(CONFIG_PAGE_ALLOC_TAGGING) += pgalloc_tag.o

obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o
+obj-$(CONFIG_TIME_STATS) += time_stats.o

lib-$(CONFIG_GENERIC_BUG) += bug.o

diff --git a/lib/time_stats.c b/lib/time_stats.c
new file mode 100644
index 0000000000..30362364fd
--- /dev/null
+++ b/lib/time_stats.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/gfp.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/percpu.h>
+#include <linux/seq_buf.h>
+#include <linux/spinlock.h>
+#include <linux/time_stats.h>
+#include <linux/timekeeping.h>
+
+static inline unsigned int eytzinger1_child(unsigned int i, unsigned int child)
+{
+ return (i << 1) + child;
+}
+
+static inline unsigned int eytzinger1_right_child(unsigned int i)
+{
+ return eytzinger1_child(i, 1);
+}
+
+static inline unsigned int eytzinger1_next(unsigned int i, unsigned int size)
+{
+ if (eytzinger1_right_child(i) <= size) {
+ i = eytzinger1_right_child(i);
+
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
+ } else {
+ i >>= ffz(i) + 1;
+ }
+
+ return i;
+}
+
+static inline unsigned int eytzinger0_child(unsigned int i, unsigned int child)
+{
+ return (i << 1) + 1 + child;
+}
+
+static inline unsigned int eytzinger0_first(unsigned int size)
+{
+ return rounddown_pow_of_two(size) - 1;
+}
+
+static inline unsigned int eytzinger0_next(unsigned int i, unsigned int size)
+{
+ return eytzinger1_next(i + 1, size) - 1;
+}
+
+#define eytzinger0_for_each(_i, _size) \
+ for ((_i) = eytzinger0_first((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_next((_i), (_size)))
+
+#define ewma_add(ewma, val, weight) \
+({ \
+ typeof(ewma) _ewma = (ewma); \
+ typeof(weight) _weight = (weight); \
+ \
+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \
+})
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+ unsigned int i = 0;
+
+ while (i < ARRAY_SIZE(q->entries)) {
+ struct quantile_entry *e = q->entries + i;
+
+ if (unlikely(!e->step)) {
+ e->m = v;
+ e->step = max_t(unsigned int, v / 2, 1024);
+ } else if (e->m > v) {
+ e->m = e->m >= e->step
+ ? e->m - e->step
+ : 0;
+ } else if (e->m < v) {
+ e->m = e->m + e->step > e->m
+ ? e->m + e->step
+ : U32_MAX;
+ }
+
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
+ e->step = max_t(unsigned int, e->step / 2, 1);
+
+ if (v >= e->m)
+ break;
+
+ i = eytzinger0_child(i, v > e->m);
+ }
+}
+
+static void time_stats_update_one(struct time_stats *stats,
+ u64 start, u64 end)
+{
+ u64 duration, freq;
+
+ duration = time_after64(end, start)
+ ? end - start : 0;
+ freq = time_after64(end, stats->last_event)
+ ? end - stats->last_event : 0;
+
+ stats->count++;
+
+ stats->average_duration = stats->average_duration
+ ? ewma_add(stats->average_duration, duration, 6)
+ : duration;
+
+ stats->average_frequency = stats->average_frequency
+ ? ewma_add(stats->average_frequency, freq, 6)
+ : freq;
+
+ stats->max_duration = max(stats->max_duration, duration);
+
+ stats->last_event = end;
+
+ quantiles_update(&stats->quantiles, duration);
+}
+
+void time_stats_update(struct time_stats *stats, u64 start)
+{
+ u64 end = ktime_get_ns();
+ unsigned long flags;
+
+ if (!stats->buffer) {
+ spin_lock_irqsave(&stats->lock, flags);
+ time_stats_update_one(stats, start, end);
+
+ if (stats->average_frequency < 32 &&
+ stats->count > 1024)
+ stats->buffer =
+ alloc_percpu_gfp(struct time_stat_buffer,
+ GFP_ATOMIC);
+ spin_unlock_irqrestore(&stats->lock, flags);
+ } else {
+ struct time_stat_buffer_entry *i;
+ struct time_stat_buffer *b;
+
+ preempt_disable();
+ b = this_cpu_ptr(stats->buffer);
+
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+ .start = start,
+ .end = end
+ };
+
+ if (b->nr == ARRAY_SIZE(b->entries)) {
+ spin_lock_irqsave(&stats->lock, flags);
+ for (i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ time_stats_update_one(stats, i->start, i->end);
+ spin_unlock_irqrestore(&stats->lock, flags);
+
+ b->nr = 0;
+ }
+
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(time_stats_update);
+
+static const struct time_unit {
+ const char *name;
+ u32 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "sec", NSEC_PER_SEC },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+static void pr_time_units(struct seq_buf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ seq_buf_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
+void time_stats_to_text(struct seq_buf *out, struct time_stats *stats)
+{
+ const struct time_unit *u;
+ u64 freq = READ_ONCE(stats->average_frequency);
+ u64 q, last_q = 0;
+ int i;
+
+ seq_buf_printf(out, "count: %llu\n", stats->count);
+ seq_buf_printf(out, "rate: %llu/sec\n",
+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
+ seq_buf_printf(out, "frequency: ");
+ pr_time_units(out, freq);
+ seq_buf_putc(out, '\n');
+
+ seq_buf_printf(out, "avg duration: ");
+ pr_time_units(out, stats->average_duration);
+ seq_buf_putc(out, '\n');
+
+ seq_buf_printf(out, "max duration: ");
+ pr_time_units(out, stats->max_duration);
+ seq_buf_putc(out, '\n');
+
+ i = eytzinger0_first(NR_QUANTILES);
+ u = pick_time_units(stats->quantiles.entries[i].m);
+ seq_buf_printf(out, "quantiles (%s): ", u->name);
+ eytzinger0_for_each(i, NR_QUANTILES) {
+ q = max(stats->quantiles.entries[i].m, last_q);
+ seq_buf_printf(out, "%llu ", div_u64(q, u->nsecs));
+ last_q = q;
+ }
+
+ seq_buf_putc(out, '\n');
+}
+EXPORT_SYMBOL_GPL(time_stats_to_text);
+
+void time_stats_exit(struct time_stats *stats)
+{
+ free_percpu(stats->buffer);
+ stats->buffer = NULL;
+}
+EXPORT_SYMBOL_GPL(time_stats_exit);
--
2.36.1

2022-08-30 00:02:55

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 1/3] lib/time_stats: New library for statistics on events



On 8/29/22 09:53, Kent Overstreet wrote:
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index bbe3ef939c..bfb49505c9 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1728,6 +1728,9 @@ config LATENCYTOP
> Enable this option if you want to use the LatencyTOP tool
> to find out which userspace is blocking on what kernel operations.
>
> +config TIME_STATS
> + bool
> +

Hi Kent,

Why not just in lib/Kconfig?

thanks.
--
~Randy

2022-08-30 00:27:02

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 1/3] lib/time_stats: New library for statistics on events

On Mon, Aug 29, 2022 at 04:34:14PM -0700, Randy Dunlap wrote:
>
>
> On 8/29/22 09:53, Kent Overstreet wrote:
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index bbe3ef939c..bfb49505c9 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -1728,6 +1728,9 @@ config LATENCYTOP
> > Enable this option if you want to use the LatencyTOP tool
> > to find out which userspace is blocking on what kernel operations.
> >
> > +config TIME_STATS
> > + bool
> > +
>
> Hi Kent,
>
> Why not just in lib/Kconfig?

Probably just lazyness, I'll move it there :)

2022-08-31 16:26:16

by Coly Li

[permalink] [raw]
Subject: Re: [PATCH 2/3] bcache: Convert to lib/time_stats



> 2022年8月30日 00:53,Kent Overstreet <[email protected]> 写道:
>
> This patch converts bcache to the new generic time_stats code
> lib/time_stats.c. The new code is from bcachefs, and has some changes
> from the version in bcache:
>
> - we now use ktime_get_ns(), not local_clock(). When the code was
> originally written multi processor systems that lacked synchronized
> TSCs were still common, and so local_clock() was much cheaper than
> sched_clock() (though not necessarily fully accurate, due to TSC
> drift). ktime_get_ns() should be cheap enough on all common hardware
> now, and more standard/correct.
>
> - time_stats are now exported in a single file in sysfs, which means we
> can improve the statistics we keep track of without changing all
> users. This also means we don't have to manually specify which units
> (ms, us, ns) a given time_stats should be printed in; that's handled
> dynamically.
>
> - There's a lazily-allocated percpu buffer, which now needs to be freed
> with time_stats_exit().
>
> Signed-off-by: Kent Overstreet <[email protected]>
> Cc: Coly Li <[email protected]>

Hi Kent,

Overall I am OK with the change to bcache code. You may add
Acked-by: Coly Li <[email protected]>
in future version of this patch.

In bcache-tools, they don’t read the changed sysfs files (including bcache-status), IMHO changing the output format won’t be problem for upstream.

My only question is, how to understand the time_stats_to_text() output format,
count: 3
rate: 0/sec
frequency: 4 sec
avg duration: 4 sec
max duration: 4 sec
quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048

Fro the above output, what are “rate”, “frequence” and “quantiles” for?

Thanks.

Coly Li

> ---
> drivers/md/bcache/Kconfig | 1 +
> drivers/md/bcache/bcache.h | 1 +
> drivers/md/bcache/bset.c | 8 +++---
> drivers/md/bcache/bset.h | 1 +
> drivers/md/bcache/btree.c | 12 ++++----
> drivers/md/bcache/super.c | 3 ++
> drivers/md/bcache/sysfs.c | 43 ++++++++++++++++++++--------
> drivers/md/bcache/util.c | 30 --------------------
> drivers/md/bcache/util.h | 57 --------------------------------------
> 9 files changed, 47 insertions(+), 109 deletions(-)
>
> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
> index 529c9d04e9..8d165052e5 100644
> --- a/drivers/md/bcache/Kconfig
> +++ b/drivers/md/bcache/Kconfig
> @@ -4,6 +4,7 @@ config BCACHE
> tristate "Block device as cache"
> select BLOCK_HOLDER_DEPRECATED if SYSFS
> select CRC64
> + select TIME_STATS
> help
> Allows a block device to be used as cache for other devices; uses
> a btree for indexing and the layout is optimized for SSDs.
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 2acda9cea0..5100010a38 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -185,6 +185,7 @@
> #include <linux/rbtree.h>
> #include <linux/rwsem.h>
> #include <linux/refcount.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
> #include <linux/workqueue.h>
> #include <linux/kthread.h>
> diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
> index 94d38e8a59..727e9b7aea 100644
> --- a/drivers/md/bcache/bset.c
> +++ b/drivers/md/bcache/bset.c
> @@ -1251,7 +1251,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
> order = state->page_order;
> }
>
> - start_time = local_clock();
> + start_time = ktime_get_ns();
>
> btree_mergesort(b, out, iter, fixup, false);
> b->nsets = start;
> @@ -1286,7 +1286,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
> bch_bset_build_written_tree(b);
>
> if (!start)
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
> }
>
> void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
> @@ -1322,14 +1322,14 @@ void bch_btree_sort_and_fix_extents(struct btree_keys *b,
> void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
> struct bset_sort_state *state)
> {
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct btree_iter iter;
>
> bch_btree_iter_init(b, &iter, NULL);
>
> btree_mergesort(b, new->set->data, &iter, false, true);
>
> - bch_time_stats_update(&state->time, start_time);
> + time_stats_update(&state->time, start_time);
>
> new->set->size = 0; // XXX: why?
> }
> diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
> index d795c84246..13e524ad77 100644
> --- a/drivers/md/bcache/bset.h
> +++ b/drivers/md/bcache/bset.h
> @@ -3,6 +3,7 @@
> #define _BCACHE_BSET_H
>
> #include <linux/kernel.h>
> +#include <linux/time_stats.h>
> #include <linux/types.h>
>
> #include "bcache_ondisk.h"
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 147c493a98..abf543bc75 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -242,7 +242,7 @@ static void btree_node_read_endio(struct bio *bio)
>
> static void bch_btree_node_read(struct btree *b)
> {
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct closure cl;
> struct bio *bio;
>
> @@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
> goto err;
>
> bch_btree_node_read_done(b);
> - bch_time_stats_update(&b->c->btree_read_time, start_time);
> + time_stats_update(&b->c->btree_read_time, start_time);
>
> return;
> err:
> @@ -1789,7 +1789,7 @@ static void bch_btree_gc(struct cache_set *c)
> struct gc_stat stats;
> struct closure writes;
> struct btree_op op;
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
>
> trace_bcache_gc_start(c);
>
> @@ -1815,7 +1815,7 @@ static void bch_btree_gc(struct cache_set *c)
> bch_btree_gc_finish(c);
> wake_up_allocators(c);
>
> - bch_time_stats_update(&c->btree_gc_time, start_time);
> + time_stats_update(&c->btree_gc_time, start_time);
>
> stats.key_bytes *= sizeof(uint64_t);
> stats.data <<= 9;
> @@ -2191,7 +2191,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
> {
> bool split;
> struct btree *n1, *n2 = NULL, *n3 = NULL;
> - uint64_t start_time = local_clock();
> + uint64_t start_time = ktime_get_ns();
> struct closure cl;
> struct keylist parent_keys;
>
> @@ -2297,7 +2297,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
> btree_node_free(b);
> rw_unlock(true, n1);
>
> - bch_time_stats_update(&b->c->btree_split_time, start_time);
> + time_stats_update(&b->c->btree_split_time, start_time);
>
> return 0;
> err_free2:
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index ba3909bb6b..26c8fa93b5 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1691,6 +1691,9 @@ static void cache_set_free(struct closure *cl)
> kobject_put(&ca->kobj);
> }
>
> + time_stats_exit(&c->btree_gc_time);
> + time_stats_exit(&c->btree_split_time);
> + time_stats_exit(&c->sort.time);
>
> if (c->moving_gc_wq)
> destroy_workqueue(c->moving_gc_wq);
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index c6f6770592..01eec5877c 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -16,6 +16,7 @@
> #include <linux/blkdev.h>
> #include <linux/sort.h>
> #include <linux/sched/clock.h>
> +#include <linux/seq_buf.h>
>
> extern bool bcache_is_reboot;
>
> @@ -79,10 +80,10 @@ read_attribute(active_journal_entries);
> read_attribute(backing_dev_name);
> read_attribute(backing_dev_uuid);
>
> -sysfs_time_stats_attribute(btree_gc, sec, ms);
> -sysfs_time_stats_attribute(btree_split, sec, us);
> -sysfs_time_stats_attribute(btree_sort, ms, us);
> -sysfs_time_stats_attribute(btree_read, ms, us);
> +read_attribute(btree_gc_time);
> +read_attribute(btree_split_time);
> +read_attribute(btree_sort_time);
> +read_attribute(btree_read_time);
>
> read_attribute(btree_nodes);
> read_attribute(btree_used_percent);
> @@ -731,6 +732,9 @@ static unsigned int bch_average_key_size(struct cache_set *c)
> SHOW(__bch_cache_set)
> {
> struct cache_set *c = container_of(kobj, struct cache_set, kobj);
> + struct seq_buf s;
> +
> + seq_buf_init(&s, buf, PAGE_SIZE);
>
> sysfs_print(synchronous, CACHE_SYNC(&c->cache->sb));
> sysfs_print(journal_delay_ms, c->journal_delay_ms);
> @@ -743,10 +747,25 @@ SHOW(__bch_cache_set)
> sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c));
> sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use);
>
> - sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
> - sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
> - sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
> - sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
> + if (attr == &sysfs_btree_gc_time) {
> + time_stats_to_text(&s, &c->btree_gc_time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_split_time) {
> + time_stats_to_text(&s, &c->btree_split_time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_sort_time) {
> + time_stats_to_text(&s, &c->sort.time);
> + return s.len;
> + }
> +
> + if (attr == &sysfs_btree_read_time) {
> + time_stats_to_text(&s, &c->btree_read_time);
> + return s.len;
> + }
>
> sysfs_print(btree_used_percent, bch_btree_used(c));
> sysfs_print(btree_nodes, c->gc_stats.nodes);
> @@ -988,10 +1007,10 @@ KTYPE(bch_cache_set);
> static struct attribute *bch_cache_set_internal_attrs[] = {
> &sysfs_active_journal_entries,
>
> - sysfs_time_stats_attribute_list(btree_gc, sec, ms)
> - sysfs_time_stats_attribute_list(btree_split, sec, us)
> - sysfs_time_stats_attribute_list(btree_sort, ms, us)
> - sysfs_time_stats_attribute_list(btree_read, ms, us)
> + &sysfs_btree_gc_time,
> + &sysfs_btree_split_time,
> + &sysfs_btree_sort_time,
> + &sysfs_btree_read_time,
>
> &sysfs_btree_nodes,
> &sysfs_btree_used_percent,
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index ae380bc399..95282bf0f9 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -160,36 +160,6 @@ int bch_parse_uuid(const char *s, char *uuid)
> return i;
> }
>
> -void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
> -{
> - uint64_t now, duration, last;
> -
> - spin_lock(&stats->lock);
> -
> - now = local_clock();
> - duration = time_after64(now, start_time)
> - ? now - start_time : 0;
> - last = time_after64(now, stats->last)
> - ? now - stats->last : 0;
> -
> - stats->max_duration = max(stats->max_duration, duration);
> -
> - if (stats->last) {
> - ewma_add(stats->average_duration, duration, 8, 8);
> -
> - if (stats->average_frequency)
> - ewma_add(stats->average_frequency, last, 8, 8);
> - else
> - stats->average_frequency = last << 8;
> - } else {
> - stats->average_duration = duration << 8;
> - }
> -
> - stats->last = now ?: 1;
> -
> - spin_unlock(&stats->lock);
> -}
> -
> /**
> * bch_next_delay() - update ratelimiting statistics and calculate next delay
> * @d: the struct bch_ratelimit to update
> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
> index 6f3cb7c921..1e1bdbae95 100644
> --- a/drivers/md/bcache/util.h
> +++ b/drivers/md/bcache/util.h
> @@ -345,68 +345,11 @@ ssize_t bch_hprint(char *buf, int64_t v);
> bool bch_is_zero(const char *p, size_t n);
> int bch_parse_uuid(const char *s, char *uuid);
>
> -struct time_stats {
> - spinlock_t lock;
> - /*
> - * all fields are in nanoseconds, averages are ewmas stored left shifted
> - * by 8
> - */
> - uint64_t max_duration;
> - uint64_t average_duration;
> - uint64_t average_frequency;
> - uint64_t last;
> -};
> -
> -void bch_time_stats_update(struct time_stats *stats, uint64_t time);
> -
> static inline unsigned int local_clock_us(void)
> {
> return local_clock() >> 10;
> }
>
> -#define NSEC_PER_ns 1L
> -#define NSEC_PER_us NSEC_PER_USEC
> -#define NSEC_PER_ms NSEC_PER_MSEC
> -#define NSEC_PER_sec NSEC_PER_SEC
> -
> -#define __print_time_stat(stats, name, stat, units) \
> - sysfs_print(name ## _ ## stat ## _ ## units, \
> - div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
> -
> -#define sysfs_print_time_stats(stats, name, \
> - frequency_units, \
> - duration_units) \
> -do { \
> - __print_time_stat(stats, name, \
> - average_frequency, frequency_units); \
> - __print_time_stat(stats, name, \
> - average_duration, duration_units); \
> - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
> - div_u64((stats)->max_duration, \
> - NSEC_PER_ ## duration_units)); \
> - \
> - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
> - ? div_s64(local_clock() - (stats)->last, \
> - NSEC_PER_ ## frequency_units) \
> - : -1LL); \
> -} while (0)
> -
> -#define sysfs_time_stats_attribute(name, \
> - frequency_units, \
> - duration_units) \
> -read_attribute(name ## _average_frequency_ ## frequency_units); \
> -read_attribute(name ## _average_duration_ ## duration_units); \
> -read_attribute(name ## _max_duration_ ## duration_units); \
> -read_attribute(name ## _last_ ## frequency_units)
> -
> -#define sysfs_time_stats_attribute_list(name, \
> - frequency_units, \
> - duration_units) \
> -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
> -&sysfs_ ## name ## _average_duration_ ## duration_units, \
> -&sysfs_ ## name ## _max_duration_ ## duration_units, \
> -&sysfs_ ## name ## _last_ ## frequency_units,
> -
> #define ewma_add(ewma, val, weight, factor) \
> ({ \
> (ewma) *= (weight) - 1; \
> --
> 2.36.1
>

2022-08-31 17:06:56

by Coly Li

[permalink] [raw]
Subject: Re: [PATCH 2/3] bcache: Convert to lib/time_stats



> 2022年9月1日 00:54,Kent Overstreet <[email protected]> 写道:
>
> On Thu, Sep 01, 2022 at 12:00:17AM +0800, Coly Li wrote:
>>
>>
>>> 2022年8月30日 00:53,Kent Overstreet <[email protected]> 写道:
>>>
>>> This patch converts bcache to the new generic time_stats code
>>> lib/time_stats.c. The new code is from bcachefs, and has some changes
>>> from the version in bcache:
>>>
>>> - we now use ktime_get_ns(), not local_clock(). When the code was
>>> originally written multi processor systems that lacked synchronized
>>> TSCs were still common, and so local_clock() was much cheaper than
>>> sched_clock() (though not necessarily fully accurate, due to TSC
>>> drift). ktime_get_ns() should be cheap enough on all common hardware
>>> now, and more standard/correct.
>>>
>>> - time_stats are now exported in a single file in sysfs, which means we
>>> can improve the statistics we keep track of without changing all
>>> users. This also means we don't have to manually specify which units
>>> (ms, us, ns) a given time_stats should be printed in; that's handled
>>> dynamically.
>>>
>>> - There's a lazily-allocated percpu buffer, which now needs to be freed
>>> with time_stats_exit().
>>>
>>> Signed-off-by: Kent Overstreet <[email protected]>
>>> Cc: Coly Li <[email protected]>
>>
>> Hi Kent,
>>
>> Overall I am OK with the change to bcache code. You may add
>> Acked-by: Coly Li <[email protected]>
>> in future version of this patch.
>>
>> In bcache-tools, they don’t read the changed sysfs files (including bcache-status), IMHO changing the output format won’t be problem for upstream.
>>
>> My only question is, how to understand the time_stats_to_text() output format,
>> count: 3
>> rate: 0/sec
>> frequency: 4 sec
>> avg duration: 4 sec
>> max duration: 4 sec
>> quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048
>>
>> Fro the above output, what are “rate”, “frequence” and “quantiles” for?
>
> Rate and frequency are inverses - in this example, we're seeing 4 events per
> second.
>

Can I understand that rate = 1/frequency ? Then frequency 4 is around to rate 0.

> The quantiles are for the duration, they give you an idea of the statistical
> distribution, see https://en.wikipedia.org/wiki/Quantile

I wanted to ask how to read the quantiles line. Does it mean that 1 ns is equally divided by 15 segments, and the counter values are for the divided 1/15 ns segments?

>
> In the near future, the quantiles will be removed and replaced with standard
> deviation, true and weighted - the quantiles algorithm isn't super accurate, we
> can give more accurate numbers with standard deviation.

Quantiles are much humane, standard deviation is scared… Just FYI LOL.

Thanks.

Coly Li

2022-08-31 17:18:57

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 2/3] bcache: Convert to lib/time_stats

On Thu, Sep 01, 2022 at 12:00:17AM +0800, Coly Li wrote:
>
>
> > 2022年8月30日 00:53,Kent Overstreet <[email protected]> 写道:
> >
> > This patch converts bcache to the new generic time_stats code
> > lib/time_stats.c. The new code is from bcachefs, and has some changes
> > from the version in bcache:
> >
> > - we now use ktime_get_ns(), not local_clock(). When the code was
> > originally written multi processor systems that lacked synchronized
> > TSCs were still common, and so local_clock() was much cheaper than
> > sched_clock() (though not necessarily fully accurate, due to TSC
> > drift). ktime_get_ns() should be cheap enough on all common hardware
> > now, and more standard/correct.
> >
> > - time_stats are now exported in a single file in sysfs, which means we
> > can improve the statistics we keep track of without changing all
> > users. This also means we don't have to manually specify which units
> > (ms, us, ns) a given time_stats should be printed in; that's handled
> > dynamically.
> >
> > - There's a lazily-allocated percpu buffer, which now needs to be freed
> > with time_stats_exit().
> >
> > Signed-off-by: Kent Overstreet <[email protected]>
> > Cc: Coly Li <[email protected]>
>
> Hi Kent,
>
> Overall I am OK with the change to bcache code. You may add
> Acked-by: Coly Li <[email protected]>
> in future version of this patch.
>
> In bcache-tools, they don’t read the changed sysfs files (including bcache-status), IMHO changing the output format won’t be problem for upstream.
>
> My only question is, how to understand the time_stats_to_text() output format,
> count: 3
> rate: 0/sec
> frequency: 4 sec
> avg duration: 4 sec
> max duration: 4 sec
> quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048
>
> Fro the above output, what are “rate”, “frequence” and “quantiles” for?

Rate and frequency are inverses - in this example, we're seeing 4 events per
second.

The quantiles are for the duration, they give you an idea of the statistical
distribution, see https://en.wikipedia.org/wiki/Quantile

In the near future, the quantiles will be removed and replaced with standard
deviation, true and weighted - the quantiles algorithm isn't super accurate, we
can give more accurate numbers with standard deviation.

2022-08-31 17:48:15

by Kent Overstreet

[permalink] [raw]
Subject: Re: [PATCH 2/3] bcache: Convert to lib/time_stats

On Thu, Sep 01, 2022 at 01:02:37AM +0800, Coly Li wrote:
> Can I understand that rate = 1/frequency ? Then frequency 4 is around to rate 0.

Correct, rate = 1/frequency.

> > The quantiles are for the duration, they give you an idea of the statistical
> > distribution, see https://en.wikipedia.org/wiki/Quantile
>
> I wanted to ask how to read the quantiles line. Does it mean that 1 ns is
> equally divided by 15 segments, and the counter values are for the divided
> 1/15 ns segments?

It is the 15 boundaries between 16 equal segments, yes.

> Quantiles are much humane, standard deviation is scared… Just FYI LOL.

That's valuable feedback then, maybe we'll keep both. The more expensive part of
time stats is just ingesting data points, but we've got a percpu buffer frontend
for that - just doing a bit more moth doesn't cost much.

2022-09-01 02:48:14

by Coly Li

[permalink] [raw]
Subject: Re: [PATCH 2/3] bcache: Convert to lib/time_stats



> 2022年9月1日 01:36,Kent Overstreet <[email protected]> 写道:
>
> On Thu, Sep 01, 2022 at 01:02:37AM +0800, Coly Li wrote:
>> Can I understand that rate = 1/frequency ? Then frequency 4 is around to rate 0.
>
> Correct, rate = 1/frequency.
>
>>> The quantiles are for the duration, they give you an idea of the statistical
>>> distribution, see https://en.wikipedia.org/wiki/Quantile
>>
>> I wanted to ask how to read the quantiles line. Does it mean that 1 ns is
>> equally divided by 15 segments, and the counter values are for the divided
>> 1/15 ns segments?
>
> It is the 15 boundaries between 16 equal segments, yes.
>
>> Quantiles are much humane, standard deviation is scared… Just FYI LOL.
>
> That's valuable feedback then, maybe we'll keep both. The more expensive part of
> time stats is just ingesting data points, but we've got a percpu buffer frontend
> for that - just doing a bit more moth doesn't cost much.

I don’t have more question so far. Thanks for the detailed replay.

Coly Li