2022-11-21 19:07:15

by Alexey Romanov

[permalink] [raw]
Subject: [RFC PATCH v1 1/4] zram: introduce merge identical pages mechanism

zram maps each page (struct page) to a zsmalloc object that
contains a compressed buffer of that page's data. This fact
generates data redundancy: for example, two identical pages
will be store in compressed form in zsmalloc allocator twice.

This patch adds a mechanism to scan zram_table_entry array
and frees all identical objects in zsmalloc allocator, leaving
only one. All zram_table_entry elements which reference this
freed objects now refer to the same, not freed, object.

To implement this mechanism, we sequentially scan the
zram_table_entry array, counting the hash from the contents of
the compressed pages (zsmalloc objects) and enter the index of
the object into the hash table (hlist_head). If the hash matches,
we remove the identical zsmalloc (zs_free) objects and update
the link rbtree.

Also, we can't just call zs_free() function during zram_free_page().
Before calling this function we have to make sure that no one else
refers to this zsmalloc object.

To implement the mechanism for merging identical compressed
pages (zsmalloc objects), a rbtree is needed.

The tree node key is a reference to the zsmalloc object
(handle), and the value is the number of references to
this object (atomic counter). This is necessary for data
consistency so that we do not zs_free the object referenced
by any element of the zram_table_entry array.

Signed-off-by: Alexey Romanov <[email protected]>
---
drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
drivers/block/zram/zram_drv.h | 6 +
2 files changed, 283 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e290d6d97047..716c2f72805e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -33,12 +33,15 @@
#include <linux/debugfs.h>
#include <linux/cpuhotplug.h>
#include <linux/part_stat.h>
+#include <linux/hashtable.h>
+#include <linux/xxhash.h>

#include "zram_drv.h"

static DEFINE_IDR(zram_index_idr);
/* idr index must be protected */
static DEFINE_MUTEX(zram_index_mutex);
+static DEFINE_MUTEX(zram_rbtree_mutex);

static int zram_major;
static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
@@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
u32 index, int offset, struct bio *bio);

+struct zram_rbtree_node {
+ struct rb_node node;
+ unsigned long key;
+ unsigned long cnt;
+};
+
+struct zram_hash_node {
+ unsigned long index;
+ struct hlist_node next;
+};

static int zram_slot_trylock(struct zram *zram, u32 index)
{
@@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
#endif
static DEVICE_ATTR_RO(debug_stat);

+static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct zram_rbtree_node *this;
+
+ while (*new) {
+ this = rb_entry(*new, struct zram_rbtree_node, node);
+ parent = *new;
+ if (data->key < this->key)
+ new = &((*new)->rb_left);
+ else if (data->key > this->key)
+ new = &((*new)->rb_right);
+ else
+ return false;
+ }
+
+ rb_link_node(&data->node, parent, new);
+ rb_insert_color(&data->node, root);
+ return true;
+}
+
+static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
+ unsigned long key)
+{
+ struct rb_node *node = root->rb_node;
+ struct zram_rbtree_node *data;
+
+ while (node) {
+ data = rb_entry(node, struct zram_rbtree_node, node);
+ if (key < data->key)
+ node = node->rb_left;
+ else if (key > data->key)
+ node = node->rb_right;
+ else
+ return data;
+ }
+
+ return NULL;
+}
+
+static unsigned long zram_calc_hash(void *src, size_t len)
+{
+ return xxhash(src, len, 0);
+}
+
+static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
+ size_t htable_size, size_t index)
+{
+ struct zram_rbtree_node *rb_node;
+ struct zram_hash_node *node;
+ unsigned long handle, cur_handle;
+ size_t obj_size;
+ char *src, *buf;
+ unsigned long hash;
+ int ret = 0;
+
+ handle = zram_get_handle(zram, index);
+ if (!handle)
+ return ret;
+
+ obj_size = zram_get_obj_size(zram, index);
+ buf = kmalloc(obj_size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("Failed to allocate zs_map_object buffer\n");
+ return -ENOMEM;
+ }
+
+ src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+ memcpy(buf, src, obj_size);
+ zs_unmap_object(zram->mem_pool, handle);
+ hash = zram_calc_hash(buf, obj_size);
+
+ mutex_lock(&zram_rbtree_mutex);
+ hlist_for_each_entry(node, &htable[hash % htable_size], next) {
+ int cmp;
+
+ zram_slot_lock(zram, node->index);
+
+ /*
+ * Page may change as the hash table is being formed,
+ * so the checks below are necessary.
+ */
+ cur_handle = zram_get_handle(zram, node->index);
+ if (handle == cur_handle ||
+ obj_size != zram_get_obj_size(zram, node->index)) {
+ zram_slot_unlock(zram, node->index);
+ continue;
+ }
+
+ src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
+ cmp = memcmp(buf, src, obj_size);
+ zs_unmap_object(zram->mem_pool, cur_handle);
+
+ if (!cmp) {
+ rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
+
+ /*
+ * This check is necessary in order not to zs_free an object
+ * that someone already refers to. This situation is possible
+ * when with repeated calls to zram_do_scan(). For example:
+ *
+ * [slot0] [slot1] [slot2] [slot3] [slot4]
+ * [obj0] [obj1] [obj2] [obj3] [obj4]
+ *
+ * Let's imagine that obj2 and obj3 are equal, and we called
+ * zram_do_scan() function:
+ *
+ * [slot0] [slot1] [slot2] [slot3] [slot4]
+ * [obj0] [obj1] [obj2] [obj2] [obj4]
+ *
+ * Now, slot2 and slot3 refers to obj2 zsmalloc object.
+ * Time passed, now slot0 refres to obj0_n, which is equal
+ * to obj2:
+ *
+ * [slot0] [slot1] [slot2] [slot3] [slot4]
+ * [obj0_n] [obj1] [obj2] [obj2] [obj4]
+ *
+ * Now we call zram_do_scan() function again. We get to slot2,
+ * and we understand that obj2 and obj0_n hashes are the same. We
+ * try to zs_free(obj2), but slot3 also already refers to it.
+ *
+ * This is not correct!
+ */
+ if (unlikely(rb_node))
+ if (rb_node->cnt > 1) {
+ zram_slot_unlock(zram, node->index);
+ continue;
+ }
+
+ zram_set_handle(zram, index, cur_handle);
+ zs_free(zram->mem_pool, handle);
+
+ rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
+
+ if (!rb_node) {
+ rb_node = kzalloc(sizeof(struct zram_rbtree_node),
+ GFP_KERNEL);
+ if (!rb_node) {
+ pr_err("Failed to allocate rb_node\n");
+ ret = -ENOMEM;
+ zram_slot_unlock(zram, node->index);
+ mutex_unlock(&zram_rbtree_mutex);
+ goto merged_or_err;
+ }
+
+ rb_node->key = cur_handle;
+ /* Two slots refers to an zsmalloc object with cur_handle key */
+ rb_node->cnt = 2;
+ zram_rbtree_insert(&zram->sph_rbtree, rb_node);
+ } else {
+ rb_node->cnt++;
+ }
+
+ atomic64_sub(obj_size, &zram->stats.compr_data_size);
+ zram_set_flag(zram, index, ZRAM_MERGED);
+ zram_set_flag(zram, node->index, ZRAM_MERGED);
+
+ zram_slot_unlock(zram, node->index);
+ mutex_unlock(&zram_rbtree_mutex);
+ goto merged_or_err;
+ }
+
+ zram_slot_unlock(zram, node->index);
+ }
+
+ mutex_unlock(&zram_rbtree_mutex);
+
+ node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
+ if (!node) {
+ ret = -ENOMEM;
+ goto merged_or_err;
+ }
+
+ node->index = index;
+ hlist_add_head(&node->next, &htable[hash % htable_size]);
+
+merged_or_err:
+ kfree(buf);
+ return ret;
+}
+
+static void zram_free_htable_entries(struct hlist_head *htable,
+ size_t htable_size)
+{
+ struct hlist_node *n;
+ struct zram_hash_node *node;
+
+ hlist_for_each_entry_safe(node, n, htable, next) {
+ hlist_del(&node->next);
+ kfree(node);
+ }
+}
+
+static int zram_do_scan(struct zram *zram)
+{
+ size_t num_pages = zram->disksize >> PAGE_SHIFT;
+ size_t htable_size = num_pages;
+ size_t index;
+ struct hlist_head *htable;
+ int i, ret = 0;
+
+ htable = vzalloc(htable_size * sizeof(struct hlist_head));
+ if (!htable) {
+ pr_err("Failed to allocate hash table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < htable_size; i++)
+ INIT_HLIST_HEAD(&htable[i]);
+
+ for (index = 0; index < num_pages; index++) {
+ zram_slot_lock(zram, index);
+
+ if (!zram_allocated(zram, index)) {
+ zram_slot_unlock(zram, index);
+ continue;
+ }
+
+ if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
+ zram_test_flag(zram, index, ZRAM_WB) ||
+ zram_test_flag(zram, index, ZRAM_SAME)) {
+ zram_slot_unlock(zram, index);
+ continue;
+ }
+
+ /* Ignore pages that have been recompressed */
+ if (zram_get_priority(zram, index) != 0)
+ continue;
+
+ ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
+ zram_slot_unlock(zram, index);
+ if (ret != 0)
+ goto out;
+ }
+
+out:
+ zram_free_htable_entries(htable, htable_size);
+ vfree(htable);
+ return ret;
+}
+
static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
@@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
static void zram_free_page(struct zram *zram, size_t index)
{
unsigned long handle;
+ struct zram_rbtree_node *node;

#ifdef CONFIG_ZRAM_MEMORY_TRACKING
zram->table[index].ac_time = 0;
@@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
if (!handle)
return;

- zs_free(zram->mem_pool, handle);
+ if (zram_test_flag(zram, index, ZRAM_MERGED)) {
+ zram_clear_flag(zram, index, ZRAM_MERGED);
+ mutex_lock(&zram_rbtree_mutex);
+
+ node = zram_rbtree_search(&zram->sph_rbtree, handle);
+ BUG_ON(!node);
+
+ node->cnt--;
+ if (node->cnt == 0) {
+ rb_erase(&node->node, &zram->sph_rbtree);
+ mutex_unlock(&zram_rbtree_mutex);
+
+ zs_free(zram->mem_pool, handle);
+ kfree(node);
+ } else {
+ mutex_unlock(&zram_rbtree_mutex);
+ }
+ } else {
+ zs_free(zram->mem_pool, handle);
+ }

atomic64_sub(zram_get_obj_size(zram, index),
&zram->stats.compr_data_size);
@@ -2421,6 +2695,8 @@ static int zram_add(void)

comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);

+ zram->sph_rbtree = RB_ROOT;
+
zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c5254626f051..4a7151c94523 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -56,6 +56,7 @@ enum zram_pageflags {

ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
+ ZRAM_MERGED, /* page was merged */

__NR_ZRAM_PAGEFLAGS,
};
@@ -140,5 +141,10 @@ struct zram {
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
struct dentry *debugfs_dir;
#endif
+ /*
+ * This is same pages handle's rb tree, where the key is a handle
+ * to same pages and the value is a link counter
+ */
+ struct rb_root sph_rbtree;
};
#endif
--
2.25.1



2022-11-23 10:04:57

by Chen Wandun

[permalink] [raw]
Subject: Re: [RFC PATCH v1 1/4] zram: introduce merge identical pages mechanism



在 2022/11/22 3:00, Alexey Romanov 写道:
> zram maps each page (struct page) to a zsmalloc object that
> contains a compressed buffer of that page's data. This fact
> generates data redundancy: for example, two identical pages
> will be store in compressed form in zsmalloc allocator twice.
>
> This patch adds a mechanism to scan zram_table_entry array
> and frees all identical objects in zsmalloc allocator, leaving
> only one. All zram_table_entry elements which reference this
> freed objects now refer to the same, not freed, object.
>
> To implement this mechanism, we sequentially scan the
> zram_table_entry array, counting the hash from the contents of
> the compressed pages (zsmalloc objects) and enter the index of
> the object into the hash table (hlist_head). If the hash matches,
> we remove the identical zsmalloc (zs_free) objects and update
> the link rbtree.
>
> Also, we can't just call zs_free() function during zram_free_page().
> Before calling this function we have to make sure that no one else
> refers to this zsmalloc object.
>
> To implement the mechanism for merging identical compressed
> pages (zsmalloc objects), a rbtree is needed.
>
> The tree node key is a reference to the zsmalloc object
> (handle), and the value is the number of references to
> this object (atomic counter). This is necessary for data
> consistency so that we do not zs_free the object referenced
> by any element of the zram_table_entry array.
>
> Signed-off-by: Alexey Romanov <[email protected]>
> ---
> drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
> drivers/block/zram/zram_drv.h | 6 +
> 2 files changed, 283 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index e290d6d97047..716c2f72805e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -33,12 +33,15 @@
> #include <linux/debugfs.h>
> #include <linux/cpuhotplug.h>
> #include <linux/part_stat.h>
> +#include <linux/hashtable.h>
> +#include <linux/xxhash.h>
>
> #include "zram_drv.h"
>
> static DEFINE_IDR(zram_index_idr);
> /* idr index must be protected */
> static DEFINE_MUTEX(zram_index_mutex);
> +static DEFINE_MUTEX(zram_rbtree_mutex);
>
> static int zram_major;
> static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
> static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
> u32 index, int offset, struct bio *bio);
>
> +struct zram_rbtree_node {
> + struct rb_node node;
> + unsigned long key;
> + unsigned long cnt;
> +};
> +
> +struct zram_hash_node {
> + unsigned long index;
> + struct hlist_node next;
> +};
>
> static int zram_slot_trylock(struct zram *zram, u32 index)
> {
> @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
> #endif
> static DEVICE_ATTR_RO(debug_stat);
>
> +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> +{
> + struct rb_node **new = &(root->rb_node), *parent = NULL;
> + struct zram_rbtree_node *this;
> +
> + while (*new) {
> + this = rb_entry(*new, struct zram_rbtree_node, node);
> + parent = *new;
> + if (data->key < this->key)
> + new = &((*new)->rb_left);
> + else if (data->key > this->key)
> + new = &((*new)->rb_right);
> + else
> + return false;
> + }
> +
> + rb_link_node(&data->node, parent, new);
> + rb_insert_color(&data->node, root);
> + return true;
> +}
> +
> +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> + unsigned long key)
> +{
> + struct rb_node *node = root->rb_node;
> + struct zram_rbtree_node *data;
> +
> + while (node) {
> + data = rb_entry(node, struct zram_rbtree_node, node);
> + if (key < data->key)
> + node = node->rb_left;
> + else if (key > data->key)
> + node = node->rb_right;
> + else
> + return data;
> + }
> +
> + return NULL;
> +}
> +
> +static unsigned long zram_calc_hash(void *src, size_t len)
> +{
> + return xxhash(src, len, 0);
> +}
> +
> +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> + size_t htable_size, size_t index)
> +{
> + struct zram_rbtree_node *rb_node;
> + struct zram_hash_node *node;
> + unsigned long handle, cur_handle;
> + size_t obj_size;
> + char *src, *buf;
> + unsigned long hash;
> + int ret = 0;
> +
> + handle = zram_get_handle(zram, index);
> + if (!handle)
> + return ret;
> +
> + obj_size = zram_get_obj_size(zram, index);
> + buf = kmalloc(obj_size, GFP_KERNEL);
> + if (!buf) {
> + pr_err("Failed to allocate zs_map_object buffer\n");
> + return -ENOMEM;
> + }
> +
> + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> + memcpy(buf, src, obj_size);
> + zs_unmap_object(zram->mem_pool, handle);
> + hash = zram_calc_hash(buf, obj_size);
> +
> + mutex_lock(&zram_rbtree_mutex);
> + hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> + int cmp;
> +
> + zram_slot_lock(zram, node->index);
> +
> + /*
> + * Page may change as the hash table is being formed,
> + * so the checks below are necessary.
> + */
> + cur_handle = zram_get_handle(zram, node->index);
> + if (handle == cur_handle ||
> + obj_size != zram_get_obj_size(zram, node->index)) {
> + zram_slot_unlock(zram, node->index);
> + continue;
> + }
> +
> + src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> + cmp = memcmp(buf, src, obj_size);
> + zs_unmap_object(zram->mem_pool, cur_handle);
> +
> + if (!cmp) {
> + rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> +
> + /*
> + * This check is necessary in order not to zs_free an object
> + * that someone already refers to. This situation is possible
> + * when with repeated calls to zram_do_scan(). For example:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0] [obj1] [obj2] [obj3] [obj4]
> + *
> + * Let's imagine that obj2 and obj3 are equal, and we called
> + * zram_do_scan() function:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0] [obj1] [obj2] [obj2] [obj4]
> + *
> + * Now, slot2 and slot3 refers to obj2 zsmalloc object.
> + * Time passed, now slot0 refres to obj0_n, which is equal
> + * to obj2:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0_n] [obj1] [obj2] [obj2] [obj4]
> + *
> + * Now we call zram_do_scan() function again. We get to slot2,
> + * and we understand that obj2 and obj0_n hashes are the same. We
> + * try to zs_free(obj2), but slot3 also already refers to it.
> + *
> + * This is not correct!
> + */
In this case, it seem like can't merge all same object, is that right?

how about let slot2 point to obj0_n and decrease the rb_node ref of
slot2/slot3 in the first loop,
let slot3 point to obj0_n and decrease the rb_node ref in the next
loop,  then the obj2 can be free.
> + if (unlikely(rb_node))
> + if (rb_node->cnt > 1) {
> + zram_slot_unlock(zram, node->index);
> + continue;
> + }
> +
> + zram_set_handle(zram, index, cur_handle);
> + zs_free(zram->mem_pool, handle);
> +
> + rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> +
> + if (!rb_node) {
> + rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> + GFP_KERNEL);
> + if (!rb_node) {
> + pr_err("Failed to allocate rb_node\n");
> + ret = -ENOMEM;
> + zram_slot_unlock(zram, node->index);
> + mutex_unlock(&zram_rbtree_mutex);
> + goto merged_or_err;
> + }
> +
> + rb_node->key = cur_handle;
> + /* Two slots refers to an zsmalloc object with cur_handle key */
> + rb_node->cnt = 2;
> + zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> + } else {
> + rb_node->cnt++;
> + }
> +
> + atomic64_sub(obj_size, &zram->stats.compr_data_size);
> + zram_set_flag(zram, index, ZRAM_MERGED);
> + zram_set_flag(zram, node->index, ZRAM_MERGED);
> +
> + zram_slot_unlock(zram, node->index);
> + mutex_unlock(&zram_rbtree_mutex);
> + goto merged_or_err;
> + }
> +
> + zram_slot_unlock(zram, node->index);
> + }
> +
> + mutex_unlock(&zram_rbtree_mutex);
> +
> + node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> + if (!node) {
> + ret = -ENOMEM;
> + goto merged_or_err;
> + }
> +
> + node->index = index;
> + hlist_add_head(&node->next, &htable[hash % htable_size]);
> +
> +merged_or_err:
> + kfree(buf);
> + return ret;
> +}
> +
> +static void zram_free_htable_entries(struct hlist_head *htable,
> + size_t htable_size)
> +{
> + struct hlist_node *n;
> + struct zram_hash_node *node;
> +
> + hlist_for_each_entry_safe(node, n, htable, next) {
> + hlist_del(&node->next);
> + kfree(node);
> + }
> +}
> +
> +static int zram_do_scan(struct zram *zram)
> +{
> + size_t num_pages = zram->disksize >> PAGE_SHIFT;
> + size_t htable_size = num_pages;
> + size_t index;
> + struct hlist_head *htable;
> + int i, ret = 0;
> +
> + htable = vzalloc(htable_size * sizeof(struct hlist_head));
> + if (!htable) {
> + pr_err("Failed to allocate hash table\n");
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < htable_size; i++)
> + INIT_HLIST_HEAD(&htable[i]);
> +
> + for (index = 0; index < num_pages; index++) {
> + zram_slot_lock(zram, index);
> +
> + if (!zram_allocated(zram, index)) {
> + zram_slot_unlock(zram, index);
> + continue;
> + }
> +
> + if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> + zram_test_flag(zram, index, ZRAM_WB) ||
> + zram_test_flag(zram, index, ZRAM_SAME)) {
> + zram_slot_unlock(zram, index);
> + continue;
> + }
> +
> + /* Ignore pages that have been recompressed */
> + if (zram_get_priority(zram, index) != 0)
> + continue;
> +
> + ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> + zram_slot_unlock(zram, index);
> + if (ret != 0)
> + goto out;
> + }
> +
> +out:
> + zram_free_htable_entries(htable, htable_size);
> + vfree(htable);
> + return ret;
> +}
> +
> static void zram_meta_free(struct zram *zram, u64 disksize)
> {
> size_t num_pages = disksize >> PAGE_SHIFT;
> @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
> static void zram_free_page(struct zram *zram, size_t index)
> {
> unsigned long handle;
> + struct zram_rbtree_node *node;
>
> #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> zram->table[index].ac_time = 0;
> @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
> if (!handle)
> return;
>
> - zs_free(zram->mem_pool, handle);
> + if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> + zram_clear_flag(zram, index, ZRAM_MERGED);
> + mutex_lock(&zram_rbtree_mutex);
> +
> + node = zram_rbtree_search(&zram->sph_rbtree, handle);
> + BUG_ON(!node);
> +
> + node->cnt--;
> + if (node->cnt == 0) {
> + rb_erase(&node->node, &zram->sph_rbtree);
> + mutex_unlock(&zram_rbtree_mutex);
> +
> + zs_free(zram->mem_pool, handle);
> + kfree(node);
> + } else {
> + mutex_unlock(&zram_rbtree_mutex);
> + }
> + } else {
> + zs_free(zram->mem_pool, handle);
> + }
>
> atomic64_sub(zram_get_obj_size(zram, index),
> &zram->stats.compr_data_size);
> @@ -2421,6 +2695,8 @@ static int zram_add(void)
>
> comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
>
> + zram->sph_rbtree = RB_ROOT;
> +
> zram_debugfs_register(zram);
> pr_info("Added device: %s\n", zram->disk->disk_name);
> return device_id;
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index c5254626f051..4a7151c94523 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -56,6 +56,7 @@ enum zram_pageflags {
>
> ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
> ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> + ZRAM_MERGED, /* page was merged */
>
> __NR_ZRAM_PAGEFLAGS,
> };
> @@ -140,5 +141,10 @@ struct zram {
> #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> struct dentry *debugfs_dir;
> #endif
> + /*
> + * This is same pages handle's rb tree, where the key is a handle
> + * to same pages and the value is a link counter
> + */
> + struct rb_root sph_rbtree;
> };
> #endif

2022-11-23 10:53:55

by Alexey Romanov

[permalink] [raw]
Subject: Re: [RFC PATCH v1 1/4] zram: introduce merge identical pages mechanism

On Wed, Nov 23, 2022 at 04:25:00PM +0800, Chen Wandun wrote:
>
>
> 在 2022/11/22 3:00, Alexey Romanov 写道:
> > zram maps each page (struct page) to a zsmalloc object that
> > contains a compressed buffer of that page's data. This fact
> > generates data redundancy: for example, two identical pages
> > will be store in compressed form in zsmalloc allocator twice.
> >
> > This patch adds a mechanism to scan zram_table_entry array
> > and frees all identical objects in zsmalloc allocator, leaving
> > only one. All zram_table_entry elements which reference this
> > freed objects now refer to the same, not freed, object.
> >
> > To implement this mechanism, we sequentially scan the
> > zram_table_entry array, counting the hash from the contents of
> > the compressed pages (zsmalloc objects) and enter the index of
> > the object into the hash table (hlist_head). If the hash matches,
> > we remove the identical zsmalloc (zs_free) objects and update
> > the link rbtree.
> >
> > Also, we can't just call zs_free() function during zram_free_page().
> > Before calling this function we have to make sure that no one else
> > refers to this zsmalloc object.
> >
> > To implement the mechanism for merging identical compressed
> > pages (zsmalloc objects), a rbtree is needed.
> >
> > The tree node key is a reference to the zsmalloc object
> > (handle), and the value is the number of references to
> > this object (atomic counter). This is necessary for data
> > consistency so that we do not zs_free the object referenced
> > by any element of the zram_table_entry array.
> >
> > Signed-off-by: Alexey Romanov <[email protected]>
> > ---
> > drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
> > drivers/block/zram/zram_drv.h | 6 +
> > 2 files changed, 283 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index e290d6d97047..716c2f72805e 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -33,12 +33,15 @@
> > #include <linux/debugfs.h>
> > #include <linux/cpuhotplug.h>
> > #include <linux/part_stat.h>
> > +#include <linux/hashtable.h>
> > +#include <linux/xxhash.h>
> > #include "zram_drv.h"
> > static DEFINE_IDR(zram_index_idr);
> > /* idr index must be protected */
> > static DEFINE_MUTEX(zram_index_mutex);
> > +static DEFINE_MUTEX(zram_rbtree_mutex);
> > static int zram_major;
> > static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> > @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
> > static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
> > u32 index, int offset, struct bio *bio);
> > +struct zram_rbtree_node {
> > + struct rb_node node;
> > + unsigned long key;
> > + unsigned long cnt;
> > +};
> > +
> > +struct zram_hash_node {
> > + unsigned long index;
> > + struct hlist_node next;
> > +};
> > static int zram_slot_trylock(struct zram *zram, u32 index)
> > {
> > @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
> > #endif
> > static DEVICE_ATTR_RO(debug_stat);
> > +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> > +{
> > + struct rb_node **new = &(root->rb_node), *parent = NULL;
> > + struct zram_rbtree_node *this;
> > +
> > + while (*new) {
> > + this = rb_entry(*new, struct zram_rbtree_node, node);
> > + parent = *new;
> > + if (data->key < this->key)
> > + new = &((*new)->rb_left);
> > + else if (data->key > this->key)
> > + new = &((*new)->rb_right);
> > + else
> > + return false;
> > + }
> > +
> > + rb_link_node(&data->node, parent, new);
> > + rb_insert_color(&data->node, root);
> > + return true;
> > +}
> > +
> > +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> > + unsigned long key)
> > +{
> > + struct rb_node *node = root->rb_node;
> > + struct zram_rbtree_node *data;
> > +
> > + while (node) {
> > + data = rb_entry(node, struct zram_rbtree_node, node);
> > + if (key < data->key)
> > + node = node->rb_left;
> > + else if (key > data->key)
> > + node = node->rb_right;
> > + else
> > + return data;
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > +static unsigned long zram_calc_hash(void *src, size_t len)
> > +{
> > + return xxhash(src, len, 0);
> > +}
> > +
> > +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> > + size_t htable_size, size_t index)
> > +{
> > + struct zram_rbtree_node *rb_node;
> > + struct zram_hash_node *node;
> > + unsigned long handle, cur_handle;
> > + size_t obj_size;
> > + char *src, *buf;
> > + unsigned long hash;
> > + int ret = 0;
> > +
> > + handle = zram_get_handle(zram, index);
> > + if (!handle)
> > + return ret;
> > +
> > + obj_size = zram_get_obj_size(zram, index);
> > + buf = kmalloc(obj_size, GFP_KERNEL);
> > + if (!buf) {
> > + pr_err("Failed to allocate zs_map_object buffer\n");
> > + return -ENOMEM;
> > + }
> > +
> > + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> > + memcpy(buf, src, obj_size);
> > + zs_unmap_object(zram->mem_pool, handle);
> > + hash = zram_calc_hash(buf, obj_size);
> > +
> > + mutex_lock(&zram_rbtree_mutex);
> > + hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> > + int cmp;
> > +
> > + zram_slot_lock(zram, node->index);
> > +
> > + /*
> > + * Page may change as the hash table is being formed,
> > + * so the checks below are necessary.
> > + */
> > + cur_handle = zram_get_handle(zram, node->index);
> > + if (handle == cur_handle ||
> > + obj_size != zram_get_obj_size(zram, node->index)) {
> > + zram_slot_unlock(zram, node->index);
> > + continue;
> > + }
> > +
> > + src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> > + cmp = memcmp(buf, src, obj_size);
> > + zs_unmap_object(zram->mem_pool, cur_handle);
> > +
> > + if (!cmp) {
> > + rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > +
> > + /*
> > + * This check is necessary in order not to zs_free an object
> > + * that someone already refers to. This situation is possible
> > + * when with repeated calls to zram_do_scan(). For example:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0] [obj1] [obj2] [obj3] [obj4]
> > + *
> > + * Let's imagine that obj2 and obj3 are equal, and we called
> > + * zram_do_scan() function:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0] [obj1] [obj2] [obj2] [obj4]
> > + *
> > + * Now, slot2 and slot3 refers to obj2 zsmalloc object.
> > + * Time passed, now slot0 refres to obj0_n, which is equal
> > + * to obj2:
> > + *
> > + * [slot0] [slot1] [slot2] [slot3] [slot4]
> > + * [obj0_n] [obj1] [obj2] [obj2] [obj4]
> > + *
> > + * Now we call zram_do_scan() function again. We get to slot2,
> > + * and we understand that obj2 and obj0_n hashes are the same. We
> > + * try to zs_free(obj2), but slot3 also already refers to it.
> > + *
> > + * This is not correct!
> > + */
> In this case, it seem like can't merge all same object, is that right?

Yes.

>
> how about let slot2 point to obj0_n and decrease the rb_node ref of
> slot2/slot3 in the first loop,
> let slot3 point to obj0_n and decrease the rb_node ref in the next loop, 
> then the obj2 can be free.

Sure. I will correct your remark in next series.

> > + if (unlikely(rb_node))
> > + if (rb_node->cnt > 1) {
> > + zram_slot_unlock(zram, node->index);
> > + continue;
> > + }
> > +
> > + zram_set_handle(zram, index, cur_handle);
> > + zs_free(zram->mem_pool, handle);
> > +
> > + rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> > +
> > + if (!rb_node) {
> > + rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> > + GFP_KERNEL);
> > + if (!rb_node) {
> > + pr_err("Failed to allocate rb_node\n");
> > + ret = -ENOMEM;
> > + zram_slot_unlock(zram, node->index);
> > + mutex_unlock(&zram_rbtree_mutex);
> > + goto merged_or_err;
> > + }
> > +
> > + rb_node->key = cur_handle;
> > + /* Two slots refers to an zsmalloc object with cur_handle key */
> > + rb_node->cnt = 2;
> > + zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> > + } else {
> > + rb_node->cnt++;
> > + }
> > +
> > + atomic64_sub(obj_size, &zram->stats.compr_data_size);
> > + zram_set_flag(zram, index, ZRAM_MERGED);
> > + zram_set_flag(zram, node->index, ZRAM_MERGED);
> > +
> > + zram_slot_unlock(zram, node->index);
> > + mutex_unlock(&zram_rbtree_mutex);
> > + goto merged_or_err;
> > + }
> > +
> > + zram_slot_unlock(zram, node->index);
> > + }
> > +
> > + mutex_unlock(&zram_rbtree_mutex);
> > +
> > + node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> > + if (!node) {
> > + ret = -ENOMEM;
> > + goto merged_or_err;
> > + }
> > +
> > + node->index = index;
> > + hlist_add_head(&node->next, &htable[hash % htable_size]);
> > +
> > +merged_or_err:
> > + kfree(buf);
> > + return ret;
> > +}
> > +
> > +static void zram_free_htable_entries(struct hlist_head *htable,
> > + size_t htable_size)
> > +{
> > + struct hlist_node *n;
> > + struct zram_hash_node *node;
> > +
> > + hlist_for_each_entry_safe(node, n, htable, next) {
> > + hlist_del(&node->next);
> > + kfree(node);
> > + }
> > +}
> > +
> > +static int zram_do_scan(struct zram *zram)
> > +{
> > + size_t num_pages = zram->disksize >> PAGE_SHIFT;
> > + size_t htable_size = num_pages;
> > + size_t index;
> > + struct hlist_head *htable;
> > + int i, ret = 0;
> > +
> > + htable = vzalloc(htable_size * sizeof(struct hlist_head));
> > + if (!htable) {
> > + pr_err("Failed to allocate hash table\n");
> > + return -ENOMEM;
> > + }
> > +
> > + for (i = 0; i < htable_size; i++)
> > + INIT_HLIST_HEAD(&htable[i]);
> > +
> > + for (index = 0; index < num_pages; index++) {
> > + zram_slot_lock(zram, index);
> > +
> > + if (!zram_allocated(zram, index)) {
> > + zram_slot_unlock(zram, index);
> > + continue;
> > + }
> > +
> > + if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> > + zram_test_flag(zram, index, ZRAM_WB) ||
> > + zram_test_flag(zram, index, ZRAM_SAME)) {
> > + zram_slot_unlock(zram, index);
> > + continue;
> > + }
> > +
> > + /* Ignore pages that have been recompressed */
> > + if (zram_get_priority(zram, index) != 0)
> > + continue;
> > +
> > + ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> > + zram_slot_unlock(zram, index);
> > + if (ret != 0)
> > + goto out;
> > + }
> > +
> > +out:
> > + zram_free_htable_entries(htable, htable_size);
> > + vfree(htable);
> > + return ret;
> > +}
> > +
> > static void zram_meta_free(struct zram *zram, u64 disksize)
> > {
> > size_t num_pages = disksize >> PAGE_SHIFT;
> > @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
> > static void zram_free_page(struct zram *zram, size_t index)
> > {
> > unsigned long handle;
> > + struct zram_rbtree_node *node;
> > #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> > zram->table[index].ac_time = 0;
> > @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
> > if (!handle)
> > return;
> > - zs_free(zram->mem_pool, handle);
> > + if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> > + zram_clear_flag(zram, index, ZRAM_MERGED);
> > + mutex_lock(&zram_rbtree_mutex);
> > +
> > + node = zram_rbtree_search(&zram->sph_rbtree, handle);
> > + BUG_ON(!node);
> > +
> > + node->cnt--;
> > + if (node->cnt == 0) {
> > + rb_erase(&node->node, &zram->sph_rbtree);
> > + mutex_unlock(&zram_rbtree_mutex);
> > +
> > + zs_free(zram->mem_pool, handle);
> > + kfree(node);
> > + } else {
> > + mutex_unlock(&zram_rbtree_mutex);
> > + }
> > + } else {
> > + zs_free(zram->mem_pool, handle);
> > + }
> > atomic64_sub(zram_get_obj_size(zram, index),
> > &zram->stats.compr_data_size);
> > @@ -2421,6 +2695,8 @@ static int zram_add(void)
> > comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
> > + zram->sph_rbtree = RB_ROOT;
> > +
> > zram_debugfs_register(zram);
> > pr_info("Added device: %s\n", zram->disk->disk_name);
> > return device_id;
> > diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> > index c5254626f051..4a7151c94523 100644
> > --- a/drivers/block/zram/zram_drv.h
> > +++ b/drivers/block/zram/zram_drv.h
> > @@ -56,6 +56,7 @@ enum zram_pageflags {
> > ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
> > ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> > + ZRAM_MERGED, /* page was merged */
> > __NR_ZRAM_PAGEFLAGS,
> > };
> > @@ -140,5 +141,10 @@ struct zram {
> > #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> > struct dentry *debugfs_dir;
> > #endif
> > + /*
> > + * This is same pages handle's rb tree, where the key is a handle
> > + * to same pages and the value is a link counter
> > + */
> > + struct rb_root sph_rbtree;
> > };
> > #endif
>

--
Thank you,
Alexey