2023-06-26 19:31:37

by Marcelo Tosatti

[permalink] [raw]
Subject: [PATCH] fs/buffer.c: remove per-CPU buffer_head lookup cache


For certain types of applications (for example PLC software or
RAN processing), upon occurrence of an event, it is necessary to
complete a certain task in a maximum amount of time (deadline).

One way to express this requirement is with a pair of numbers,
deadline time and execution time, where:

* deadline time: length of time between event and deadline.
* execution time: length of time it takes for processing of event
to occur on a particular hardware platform
(uninterrupted).

The particular values depend on use-case. For the case
where the realtime application executes in a virtualized
guest, an IPI which must be serviced in the host will cause
the following sequence of events:

1) VM-exit
2) execution of IPI (and function call)
3) VM-entry

Which causes an excess of 50us latency as observed by cyclictest
(this violates the latency requirement of vRAN application with 1ms TTI,
for example).

invalidate_bh_lrus calls an IPI on each CPU that has non empty
per-CPU cache:

on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);

Upon closer investigation, it was found that in current codebase, lookup_bh_lru
is slower than __find_get_block_slow:

114 ns per __find_get_block
68 ns per __find_get_block_slow

So remove the per-CPU buffer_head caching.

Test program:

#define NRLOOPS 200000
static int __init example_init(void)
{
ktime_t s, e;
s64 delta;
int i, suc;

bdev = blkdev_get_by_path("/dev/loop0", FMODE_READ, NULL);
if (IS_ERR(bdev)) {
printk(KERN_ERR "failed to load /dev/loop0\n");
return -ENODEV;
}

suc = 0;
delta = 0;
for (i=0; i < NRLOOPS; i++) {
struct buffer_head *bh;

s = ktime_get();
bh = __find_get_block(bdev, 1, 512);
e = ktime_get();
if (bh) {
suc++;
__brelse(bh);
}
delta = delta + ktime_to_ns(ktime_sub(e, s));

}
printk(KERN_ERR "%lld ns per __find_get_block (suc=%d)\n", delta/NRLOOPS, suc);

suc = 0;
delta = 0;
for (i=0; i < NRLOOPS; i++) {
struct buffer_head *bh;

s = ktime_get();
bh = __find_get_block_slow(bdev, 1);
e = ktime_get();
if (bh) {
suc++;
__brelse(bh);
}
delta = delta + ktime_to_ns(ktime_sub(e, s));
}
printk(KERN_ERR "%lld ns per __find_get_block_slow (suc=%d)\n", delta/NRLOOPS, suc);

return 0;
}

Signed-off-by: Marcelo Tosatti <[email protected]>

---

block/bdev.c | 2 -
fs/buffer.c | 209 +++++-------------------------------------------------------------------------------------------------------------------------------------------------------
fs/jbd2/revoke.c | 9 ++----
fs/mpage.c | 3 --
fs/ocfs2/journal.c | 3 --
fs/reiserfs/reiserfs.h | 2 -
include/linux/buffer_head.h | 11 ++------
mm/migrate.c | 12 +-------
mm/swap.c | 4 --
9 files changed, 21 insertions(+), 234 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 21c63bfef323..dc511024b11f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -72,7 +72,6 @@ static void kill_bdev(struct block_device *bdev)
if (mapping_empty(mapping))
return;

- invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
}

@@ -82,7 +81,6 @@ void invalidate_bdev(struct block_device *bdev)
struct address_space *mapping = bdev->bd_inode->i_mapping;

if (mapping->nrpages) {
- invalidate_bh_lrus();
lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index a7fc561758b1..916d35af8628 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -186,8 +186,8 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* may be quite high. This code could TryLock the page, and if that
* succeeds, there is no need to take private_lock.
*/
-static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
*/
ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
if (all_mapped && __ratelimit(&last_warned)) {
- printk("__find_get_block_slow() failed. block=%llu, "
+ printk("__find_get_block() failed. block=%llu, "
"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
"device %pg blocksize: %d\n",
(unsigned long long)block,
@@ -241,6 +241,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
out:
return ret;
}
+EXPORT_SYMBOL(__find_get_block);

static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
@@ -598,10 +599,9 @@ EXPORT_SYMBOL(sync_mapping_buffers);
* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
* dirty, schedule it for IO. So that indirects merge nicely with their data.
*/
-void write_boundary_block(struct block_device *bdev,
- sector_t bblock, unsigned blocksize)
+void write_boundary_block(struct block_device *bdev, sector_t bblock)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh = __find_get_block(bdev, bblock + 1);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1080,7 +1080,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
struct buffer_head *bh;
int ret;

- bh = __find_get_block(bdev, block, size);
+ bh = __find_get_block(bdev, block);
if (bh)
return bh;

@@ -1232,137 +1232,6 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
return NULL;
}

-/*
- * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
- * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
- * refcount elevated by one when they're in an LRU. A buffer can only appear
- * once in a particular CPU's LRU. A single buffer can be present in multiple
- * CPU's LRUs at the same time.
- *
- * This is a transparent caching front-end to sb_bread(), sb_getblk() and
- * sb_find_get_block().
- *
- * The LRUs themselves only need locking against invalidate_bh_lrus. We use
- * a local interrupt disable for that.
- */
-
-#define BH_LRU_SIZE 16
-
-struct bh_lru {
- struct buffer_head *bhs[BH_LRU_SIZE];
-};
-
-static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
-
-#ifdef CONFIG_SMP
-#define bh_lru_lock() local_irq_disable()
-#define bh_lru_unlock() local_irq_enable()
-#else
-#define bh_lru_lock() preempt_disable()
-#define bh_lru_unlock() preempt_enable()
-#endif
-
-static inline void check_irqs_on(void)
-{
-#ifdef irqs_disabled
- BUG_ON(irqs_disabled());
-#endif
-}
-
-/*
- * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
- * inserted at the front, and the buffer_head at the back if any is evicted.
- * Or, if already in the LRU it is moved to the front.
- */
-static void bh_lru_install(struct buffer_head *bh)
-{
- struct buffer_head *evictee = bh;
- struct bh_lru *b;
- int i;
-
- check_irqs_on();
- bh_lru_lock();
-
- /*
- * the refcount of buffer_head in bh_lru prevents dropping the
- * attached page(i.e., try_to_free_buffers) so it could cause
- * failing page migration.
- * Skip putting upcoming bh into bh_lru until migration is done.
- */
- if (lru_cache_disabled()) {
- bh_lru_unlock();
- return;
- }
-
- b = this_cpu_ptr(&bh_lrus);
- for (i = 0; i < BH_LRU_SIZE; i++) {
- swap(evictee, b->bhs[i]);
- if (evictee == bh) {
- bh_lru_unlock();
- return;
- }
- }
-
- get_bh(bh);
- bh_lru_unlock();
- brelse(evictee);
-}
-
-/*
- * Look up the bh in this cpu's LRU. If it's there, move it to the head.
- */
-static struct buffer_head *
-lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
-{
- struct buffer_head *ret = NULL;
- unsigned int i;
-
- check_irqs_on();
- bh_lru_lock();
- for (i = 0; i < BH_LRU_SIZE; i++) {
- struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
-
- if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
- bh->b_size == size) {
- if (i) {
- while (i) {
- __this_cpu_write(bh_lrus.bhs[i],
- __this_cpu_read(bh_lrus.bhs[i - 1]));
- i--;
- }
- __this_cpu_write(bh_lrus.bhs[0], bh);
- }
- get_bh(bh);
- ret = bh;
- break;
- }
- }
- bh_lru_unlock();
- return ret;
-}
-
-/*
- * Perform a pagecache lookup for the matching buffer. If it's there, refresh
- * it in the LRU and mark it as accessed. If it is not present then return
- * NULL
- */
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
-{
- struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
-
- if (bh == NULL) {
- /* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
- if (bh)
- bh_lru_install(bh);
- } else
- touch_buffer(bh);
-
- return bh;
-}
-EXPORT_SYMBOL(__find_get_block);
-
/*
* __getblk_gfp() will locate (and, if necessary, create) the buffer_head
* which corresponds to the passed block_device, block and size. The
@@ -1375,7 +1244,7 @@ struct buffer_head *
__getblk_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh = __find_get_block(bdev, block);

might_sleep();
if (bh == NULL)
@@ -1421,61 +1290,6 @@ __bread_gfp(struct block_device *bdev, sector_t block,
}
EXPORT_SYMBOL(__bread_gfp);

-static void __invalidate_bh_lrus(struct bh_lru *b)
-{
- int i;
-
- for (i = 0; i < BH_LRU_SIZE; i++) {
- brelse(b->bhs[i]);
- b->bhs[i] = NULL;
- }
-}
-/*
- * invalidate_bh_lrus() is called rarely - but not only at unmount.
- * This doesn't race because it runs in each cpu either in irq
- * or with preempt disabled.
- */
-static void invalidate_bh_lru(void *arg)
-{
- struct bh_lru *b = &get_cpu_var(bh_lrus);
-
- __invalidate_bh_lrus(b);
- put_cpu_var(bh_lrus);
-}
-
-bool has_bh_in_lru(int cpu, void *dummy)
-{
- struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
- int i;
-
- for (i = 0; i < BH_LRU_SIZE; i++) {
- if (b->bhs[i])
- return true;
- }
-
- return false;
-}
-
-void invalidate_bh_lrus(void)
-{
- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
-
-/*
- * It's called from workqueue context so we need a bh_lru_lock to close
- * the race with preemption/irq.
- */
-void invalidate_bh_lrus_cpu(void)
-{
- struct bh_lru *b;
-
- bh_lru_lock();
- b = this_cpu_ptr(&bh_lrus);
- __invalidate_bh_lrus(b);
- bh_lru_unlock();
-}
-
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
@@ -2997,13 +2811,6 @@ EXPORT_SYMBOL(free_buffer_head);

static int buffer_exit_cpu_dead(unsigned int cpu)
{
- int i;
- struct bh_lru *b = &per_cpu(bh_lrus, cpu);
-
- for (i = 0; i < BH_LRU_SIZE; i++) {
- brelse(b->bhs[i]);
- b->bhs[i] = NULL;
- }
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
return 0;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 4556e4689024..f68b9207737d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -345,7 +345,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;

if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block(bdev, blocknr);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -355,7 +355,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,

/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block(bdev, blocknr);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -466,7 +466,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
@@ -496,8 +496,7 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
+ record->blocknr);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213ee064..e50d30a009ce 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -634,8 +634,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
if (boundary || (first_unmapped != blocks_per_page)) {
bio = mpage_bio_submit_write(bio);
if (boundary_block) {
- write_boundary_block(boundary_bdev,
- boundary_block, 1 << blkbits);
+ write_boundary_block(boundary_bdev, boundary_block);
}
} else {
mpd->last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 25d8072ccfce..12be1471c9aa 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1212,8 +1212,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
}

for (i = 0; i < p_blocks; i++, p_blkno++) {
- bh = __find_get_block(osb->sb->s_bdev, p_blkno,
- osb->sb->s_blocksize);
+ bh = __find_get_block(osb->sb->s_bdev, p_blkno);
/* block not cached. */
if (!bh)
continue;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 1bccf6a2e908..19708f600bce 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2810,7 +2810,7 @@ struct reiserfs_journal_header {
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])

/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block)
#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 1520793c72da..084a9d5f53d3 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -227,8 +227,7 @@ static inline void clean_bdev_bh_alias(struct buffer_head *bh)
void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
-struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
- unsigned size);
+struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block);
struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
@@ -236,9 +235,6 @@ void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
-void invalidate_bh_lrus(void);
-void invalidate_bh_lrus_cpu(void);
-bool has_bh_in_lru(int cpu, void *dummy);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
@@ -247,8 +243,7 @@ int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void submit_bh(blk_opf_t, struct buffer_head *);
-void write_boundary_block(struct block_device *bdev,
- sector_t bblock, unsigned blocksize);
+void write_boundary_block(struct block_device *bdev, sector_t bblock);
int bh_uptodate_or_lock(struct buffer_head *bh);
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
@@ -375,7 +370,7 @@ sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
- return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
+ return __find_get_block(sb->s_bdev, block);
}

static inline void
diff --git a/mm/migrate.c b/mm/migrate.c
index 01cac26a3127..ceecd95cfd49 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -747,9 +747,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,

if (check_refs) {
bool busy;
- bool invalidated = false;

-recheck_buffers:
busy = false;
spin_lock(&mapping->private_lock);
bh = head;
@@ -761,14 +759,8 @@ static int __buffer_migrate_folio(struct address_space *mapping,
bh = bh->b_this_page;
} while (bh != head);
if (busy) {
- if (invalidated) {
- rc = -EAGAIN;
- goto unlock_buffers;
- }
- spin_unlock(&mapping->private_lock);
- invalidate_bh_lrus();
- invalidated = true;
- goto recheck_buffers;
+ rc = -EAGAIN;
+ goto unlock_buffers;
}
}

diff --git a/mm/swap.c b/mm/swap.c
index 423199ee8478..64ce7255ff4d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -765,7 +765,6 @@ static void lru_add_and_bh_lrus_drain(void)
local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
local_unlock(&cpu_fbatches.lock);
- invalidate_bh_lrus_cpu();
mlock_drain_local();
}

@@ -798,8 +797,7 @@ static bool cpu_needs_drain(unsigned int cpu)
folio_batch_count(&fbatches->lru_deactivate) ||
folio_batch_count(&fbatches->lru_lazyfree) ||
folio_batch_count(&fbatches->activate) ||
- need_mlock_drain(cpu) ||
- has_bh_in_lru(cpu, NULL);
+ need_mlock_drain(cpu);
}

/*



2023-06-26 22:55:34

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH] fs/buffer.c: remove per-CPU buffer_head lookup cache

On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> is slower than __find_get_block_slow:
>
> 114 ns per __find_get_block
> 68 ns per __find_get_block_slow
>
> So remove the per-CPU buffer_head caching.
>
> Test program:
>
> #define NRLOOPS 200000
> static int __init example_init(void)
> {
> ktime_t s, e;
> s64 delta;
> int i, suc;
>
> bdev = blkdev_get_by_path("/dev/loop0", FMODE_READ, NULL);
> if (IS_ERR(bdev)) {
> printk(KERN_ERR "failed to load /dev/loop0\n");
> return -ENODEV;
> }
>
> suc = 0;
> delta = 0;
> for (i=0; i < NRLOOPS; i++) {
> struct buffer_head *bh;
>
> s = ktime_get();
> bh = __find_get_block(bdev, 1, 512);
> e = ktime_get();
> if (bh) {
> suc++;
> __brelse(bh);
> }
> delta = delta + ktime_to_ns(ktime_sub(e, s));
>
> }
> printk(KERN_ERR "%lld ns per __find_get_block (suc=%d)\n", delta/NRLOOPS, suc);
>
> suc = 0;
> delta = 0;
> for (i=0; i < NRLOOPS; i++) {
> struct buffer_head *bh;
>
> s = ktime_get();
> bh = __find_get_block_slow(bdev, 1);
> e = ktime_get();
> if (bh) {
> suc++;
> __brelse(bh);
> }
> delta = delta + ktime_to_ns(ktime_sub(e, s));
> }
> printk(KERN_ERR "%lld ns per __find_get_block_slow (suc=%d)\n", delta/NRLOOPS, suc);

It occurs to me that this is close to being the best-case scenario for
page-cache lookup as well as for lru lookup. Can you re-run it with
block 4UL * 1024 * 1024 * 1024 instead of block 1?

2023-06-27 06:48:24

by Christoph Hellwig

[permalink] [raw]