2021-08-31 14:50:35

by Alex Zhuravlev

[permalink] [raw]
Subject: [RFC] replace revoke hash table with rhashtable

Hi,

Not so long ago we noticed that journal replay can take quite a lot (hours) in cases where many journaled blocks were freed during a short period.
I benchmarked hash table used by revoke code, basically it’s lookup+insert like jbd2 does at replay:

1048576 records - 95 seconds
2097152 records - 580 seconds

Then I benchmarked rhashtable:
1048576 records - 2 seconds
2097152 records - 3 seconds
4194304 records - 7 seconds

So, here is a patch replacing existing fixed-size hash table with rhashtable, please have a look.

Thanks, Alex


From 64b9161db7fee4eea665833765221d8a7e5903b6 Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <[email protected]>
Date: Tue, 31 Aug 2021 11:53:09 +0300
Subject: [PATCH] jbd2 to replace fixed-size revoke hashtable with rhashtable

Signed-off-by: Alex Zhuravlev <[email protected]>
---
fs/jbd2/journal.c | 5 +-
fs/jbd2/revoke.c | 255 +++++++++++++++----------------------------
include/linux/jbd2.h | 8 +-
3 files changed, 90 insertions(+), 178 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 35302bc192eb..6e3a2cc9dfd2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1370,7 +1370,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_flags = JBD2_ABORT;

/* Set up a default-sized revoke table for the new mount. */
- err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+ err = jbd2_journal_init_revoke(journal);
if (err)
goto err_cleanup;

@@ -3136,8 +3136,6 @@ static int __init journal_init_caches(void)
int ret;

ret = jbd2_journal_init_revoke_record_cache();
- if (ret == 0)
- ret = jbd2_journal_init_revoke_table_cache();
if (ret == 0)
ret = jbd2_journal_init_journal_head_cache();
if (ret == 0)
@@ -3152,7 +3150,6 @@ static int __init journal_init_caches(void)
static void jbd2_journal_destroy_caches(void)
{
jbd2_journal_destroy_revoke_record_cache();
- jbd2_journal_destroy_revoke_table_cache();
jbd2_journal_destroy_journal_head_cache();
jbd2_journal_destroy_handle_cache();
jbd2_journal_destroy_inode_cache();
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index fa608788b93d..4533bd49e879 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -90,10 +90,10 @@
#include <linux/bio.h>
#include <linux/log2.h>
#include <linux/hash.h>
+#include <linux/rhashtable.h>
#endif

static struct kmem_cache *jbd2_revoke_record_cache;
-static struct kmem_cache *jbd2_revoke_table_cache;

/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
@@ -101,23 +101,17 @@ static struct kmem_cache *jbd2_revoke_table_cache;

struct jbd2_revoke_record_s
{
- struct list_head hash;
+ struct rhash_head linkage;
tid_t sequence; /* Used for recovery only */
unsigned long long blocknr;
};

-
-/* The revoke table is just a simple hash table of revoke records. */
-struct jbd2_revoke_table_s
-{
- /* It is conceivable that we might want a larger hash table
- * for recovery. Must be a power of two. */
- int hash_size;
- int hash_shift;
- struct list_head *hash_table;
+static const struct rhashtable_params revoke_rhashtable_params = {
+ .key_len = sizeof(unsigned long long),
+ .key_offset = offsetof(struct jbd2_revoke_record_s, blocknr),
+ .head_offset = offsetof(struct jbd2_revoke_record_s, linkage),
};

-
#ifdef __KERNEL__
static void write_one_revoke_record(transaction_t *,
struct list_head *,
@@ -126,18 +120,10 @@ static void write_one_revoke_record(transaction_t *,
static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif

-/* Utility functions to maintain the revoke table */
-
-static inline int hash(journal_t *journal, unsigned long long block)
-{
- return hash_64(block, journal->j_revoke->hash_shift);
-}
-
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
tid_t seq)
{
- struct list_head *hash_list;
- struct jbd2_revoke_record_s *record;
+ struct jbd2_revoke_record_s *record, *old;
gfp_t gfp_mask = GFP_NOFS;

if (journal_oom_retry)
@@ -148,10 +134,12 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,

record->sequence = seq;
record->blocknr = blocknr;
- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
- spin_lock(&journal->j_revoke_lock);
- list_add(&record->hash, hash_list);
- spin_unlock(&journal->j_revoke_lock);
+ old = rhashtable_lookup_get_insert_fast(journal->j_revoke,
+ &record->linkage, revoke_rhashtable_params);
+ if (old) {
+ BUG_ON(record->sequence != seq);
+ kmem_cache_free(jbd2_revoke_record_cache, record);
+ }
return 0;
}

@@ -160,22 +148,8 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
unsigned long long blocknr)
{
- struct list_head *hash_list;
- struct jbd2_revoke_record_s *record;
-
- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-
- spin_lock(&journal->j_revoke_lock);
- record = (struct jbd2_revoke_record_s *) hash_list->next;
- while (&(record->hash) != hash_list) {
- if (record->blocknr == blocknr) {
- spin_unlock(&journal->j_revoke_lock);
- return record;
- }
- record = (struct jbd2_revoke_record_s *) record->hash.next;
- }
- spin_unlock(&journal->j_revoke_lock);
- return NULL;
+ return rhashtable_lookup_fast(journal->j_revoke, &blocknr,
+ revoke_rhashtable_params);
}

void jbd2_journal_destroy_revoke_record_cache(void)
@@ -184,12 +158,6 @@ void jbd2_journal_destroy_revoke_record_cache(void)
jbd2_revoke_record_cache = NULL;
}

-void jbd2_journal_destroy_revoke_table_cache(void)
-{
- kmem_cache_destroy(jbd2_revoke_table_cache);
- jbd2_revoke_table_cache = NULL;
-}
-
int __init jbd2_journal_init_revoke_record_cache(void)
{
J_ASSERT(!jbd2_revoke_record_cache);
@@ -203,85 +171,27 @@ int __init jbd2_journal_init_revoke_record_cache(void)
return 0;
}

-int __init jbd2_journal_init_revoke_table_cache(void)
-{
- J_ASSERT(!jbd2_revoke_table_cache);
- jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
- SLAB_TEMPORARY);
- if (!jbd2_revoke_table_cache) {
- pr_emerg("JBD2: failed to create revoke_table cache\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
-{
- int shift = 0;
- int tmp = hash_size;
- struct jbd2_revoke_table_s *table;
-
- table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
- if (!table)
- goto out;
-
- while((tmp >>= 1UL) != 0UL)
- shift++;
-
- table->hash_size = hash_size;
- table->hash_shift = shift;
- table->hash_table =
- kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
- if (!table->hash_table) {
- kmem_cache_free(jbd2_revoke_table_cache, table);
- table = NULL;
- goto out;
- }
-
- for (tmp = 0; tmp < hash_size; tmp++)
- INIT_LIST_HEAD(&table->hash_table[tmp]);
-
-out:
- return table;
-}
-
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
-{
- int i;
- struct list_head *hash_list;
-
- for (i = 0; i < table->hash_size; i++) {
- hash_list = &table->hash_table[i];
- J_ASSERT(list_empty(hash_list));
- }
-
- kfree(table->hash_table);
- kmem_cache_free(jbd2_revoke_table_cache, table);
-}
-
/* Initialise the revoke table for a given journal to a given size. */
-int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
+int jbd2_journal_init_revoke(journal_t *journal)
{
- J_ASSERT(journal->j_revoke_table[0] == NULL);
- J_ASSERT(is_power_of_2(hash_size));
+ int rc;

- journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
- if (!journal->j_revoke_table[0])
+ rc = rhashtable_init(&journal->j_revoke_table[0], &revoke_rhashtable_params);
+ if (rc)
goto fail0;

- journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
- if (!journal->j_revoke_table[1])
+ rc = rhashtable_init(&journal->j_revoke_table[1], &revoke_rhashtable_params);
+ if (rc)
goto fail1;

- journal->j_revoke = journal->j_revoke_table[1];
+ journal->j_revoke = &journal->j_revoke_table[1];

spin_lock_init(&journal->j_revoke_lock);

return 0;

fail1:
- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
- journal->j_revoke_table[0] = NULL;
+ rhashtable_destroy(&journal->j_revoke_table[0]);
fail0:
return -ENOMEM;
}
@@ -290,10 +200,8 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
void jbd2_journal_destroy_revoke(journal_t *journal)
{
journal->j_revoke = NULL;
- if (journal->j_revoke_table[0])
- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
- if (journal->j_revoke_table[1])
- jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]);
+ rhashtable_destroy(&journal->j_revoke_table[0]);
+ rhashtable_destroy(&journal->j_revoke_table[1]);
}


@@ -446,9 +354,8 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
- spin_lock(&journal->j_revoke_lock);
- list_del(&record->hash);
- spin_unlock(&journal->j_revoke_lock);
+ rhashtable_remove_fast(journal->j_revoke, &record->linkage,
+ revoke_rhashtable_params);
kmem_cache_free(jbd2_revoke_record_cache, record);
did_revoke = 1;
}
@@ -483,27 +390,29 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
*/
void jbd2_clear_buffer_revoked_flags(journal_t *journal)
{
- struct jbd2_revoke_table_s *revoke = journal->j_revoke;
- int i = 0;
-
- for (i = 0; i < revoke->hash_size; i++) {
- struct list_head *hash_list;
- struct list_head *list_entry;
- hash_list = &revoke->hash_table[i];
-
- list_for_each(list_entry, hash_list) {
- struct jbd2_revoke_record_s *record;
- struct buffer_head *bh;
- record = (struct jbd2_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
- if (bh) {
- clear_buffer_revoked(bh);
- __brelse(bh);
- }
+ struct rhashtable *revoke = journal->j_revoke;
+ struct jbd2_revoke_record_s *record;
+ struct rhashtable_iter iter;
+
+ rhashtable_walk_enter(revoke, &iter);
+ rhashtable_walk_start(&iter);
+ while ((record = rhashtable_walk_next(&iter)) != NULL) {
+ struct buffer_head *bh;
+
+ if (IS_ERR(record))
+ continue;
+ rhashtable_walk_stop(&iter);
+ bh = __find_get_block(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
+ if (bh) {
+ clear_buffer_revoked(bh);
+ __brelse(bh);
}
- }
+ rhashtable_walk_start(&iter);
+ }
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}

/* journal_switch_revoke table select j_revoke for next transaction
@@ -512,15 +421,12 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
- int i;
-
- if (journal->j_revoke == journal->j_revoke_table[0])
- journal->j_revoke = journal->j_revoke_table[1];
+ if (journal->j_revoke == &journal->j_revoke_table[0])
+ journal->j_revoke = &journal->j_revoke_table[1];
else
- journal->j_revoke = journal->j_revoke_table[0];
+ journal->j_revoke = &journal->j_revoke_table[0];

- for (i = 0; i < journal->j_revoke->hash_size; i++)
- INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+ /* XXX: check rhashtable is empty? reinitialize it? */
}

/*
@@ -533,31 +439,36 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction,
journal_t *journal = transaction->t_journal;
struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
- struct jbd2_revoke_table_s *revoke;
- struct list_head *hash_list;
- int i, offset, count;
+ struct rhashtable_iter iter;
+ struct rhashtable *revoke;
+ int offset, count;

descriptor = NULL;
offset = 0;
count = 0;

/* select revoke table for committing transaction */
- revoke = journal->j_revoke == journal->j_revoke_table[0] ?
- journal->j_revoke_table[1] : journal->j_revoke_table[0];
-
- for (i = 0; i < revoke->hash_size; i++) {
- hash_list = &revoke->hash_table[i];
-
- while (!list_empty(hash_list)) {
- record = (struct jbd2_revoke_record_s *)
- hash_list->next;
+ revoke = journal->j_revoke == &journal->j_revoke_table[0] ?
+ &journal->j_revoke_table[1] : &journal->j_revoke_table[0];
+
+ rhashtable_walk_enter(revoke, &iter);
+ rhashtable_walk_start(&iter);
+ while ((record = rhashtable_walk_next(&iter)) != NULL) {
+ if (IS_ERR(record))
+ continue;
+ if (rhashtable_remove_fast(revoke,
+ &record->linkage,
+ revoke_rhashtable_params) == 0) {
+ rhashtable_walk_stop(&iter);
write_one_revoke_record(transaction, log_bufs,
&descriptor, &offset, record);
+ rhashtable_walk_start(&iter);
count++;
- list_del(&record->hash);
kmem_cache_free(jbd2_revoke_record_cache, record);
}
}
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
if (descriptor)
flush_descriptor(journal, descriptor, offset);
jbd_debug(1, "Wrote %d revoke records\n", count);
@@ -725,19 +636,23 @@ int jbd2_journal_test_revoke(journal_t *journal,

void jbd2_journal_clear_revoke(journal_t *journal)
{
- int i;
- struct list_head *hash_list;
+ struct rhashtable_iter iter;
struct jbd2_revoke_record_s *record;
- struct jbd2_revoke_table_s *revoke;
+ struct rhashtable *revoke;

revoke = journal->j_revoke;

- for (i = 0; i < revoke->hash_size; i++) {
- hash_list = &revoke->hash_table[i];
- while (!list_empty(hash_list)) {
- record = (struct jbd2_revoke_record_s*) hash_list->next;
- list_del(&record->hash);
+ rhashtable_walk_enter(revoke, &iter);
+ rhashtable_walk_start(&iter);
+ while ((record = rhashtable_walk_next(&iter)) != NULL) {
+ if (IS_ERR(record))
+ continue;
+ if (rhashtable_remove_fast(revoke,
+ &record->linkage,
+ revoke_rhashtable_params) == 0) {
kmem_cache_free(jbd2_revoke_record_cache, record);
- }
- }
+ }
+ }
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fd933c45281a..dcde4329cdd1 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -29,6 +29,7 @@
#include <linux/bit_spinlock.h>
#include <linux/blkdev.h>
#include <crypto/hash.h>
+#include <linux/rhashtable.h>
#endif

#define journal_oom_retry 1
@@ -1135,12 +1136,12 @@ struct journal_s
* The revoke table - maintains the list of revoked blocks in the
* current transaction.
*/
- struct jbd2_revoke_table_s *j_revoke;
+ struct rhashtable *j_revoke;

/**
* @j_revoke_table: Alternate revoke tables for j_revoke.
*/
- struct jbd2_revoke_table_s *j_revoke_table[2];
+ struct rhashtable j_revoke_table[2];

/**
* @j_wbuf: Array of bhs for jbd2_journal_commit_transaction.
@@ -1625,8 +1626,7 @@ static inline void jbd2_free_inode(struct jbd2_inode *jinode)
}

/* Primary revoke support */
-#define JOURNAL_REVOKE_DEFAULT_HASH 256
-extern int jbd2_journal_init_revoke(journal_t *, int);
+extern int jbd2_journal_init_revoke(journal_t *);
extern void jbd2_journal_destroy_revoke_record_cache(void);
extern void jbd2_journal_destroy_revoke_table_cache(void);
extern int __init jbd2_journal_init_revoke_record_cache(void);
--
2.26.3



2021-09-01 18:33:57

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] replace revoke hash table with rhashtable

On Aug 31, 2021, at 8:49 AM, Alex Zhuravlev <[email protected]> wrote:
>
> Hi,
>
> Not so long ago we noticed that journal replay can take quite a lot (hours)
> in cases where many journaled blocks were freed during a short period.

It may be worthwhile to mention this was a case with a 4GB journal size.

> I benchmarked hash table used by revoke code, basically it’s lookup+insert
> like jbd2 does at replay:
>
> 1048576 records - 95 seconds
> 2097152 records - 580 seconds
>
> Then I benchmarked rhashtable:
> 1048576 records - 2 seconds
> 2097152 records - 3 seconds
> 4194304 records - 7 seconds
>
> So, here is a patch replacing existing fixed-size hash table with rhashtable, please have a look.
>
> Thanks, Alex

Alex,
the patch looks good from both a performance standpoint, as well as
a good reduction in lines (and possibly memory, for the cases where
there are fewer entries in the hash than the static table size).

I did notice some lines are using 8-space indents instead of tabs.
That can be fixed if there are any other comments, and you resubmit
without [RFC].

Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP