From: Joonsoo Kim <[email protected]>
Changes from v1
o reogranize dedup specific functions
o support Kconfig on/off
o update zram documents
o compare all the entries with same checksum (patch #4)
This patchset implements deduplication feature in zram. Motivation
is to save memory usage by zram. There are detailed description
about motivation and experimental results on patch #2 so please
refer it.
Thanks.
Joonsoo Kim (4):
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional
zram: compare all the entries with same checksum for deduplication
Documentation/ABI/testing/sysfs-block-zram | 10 +
Documentation/blockdev/zram.txt | 3 +
drivers/block/zram/Kconfig | 14 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 288 +++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 56 ++++++
drivers/block/zram/zram_drv.c | 170 +++++++++++++----
drivers/block/zram/zram_drv.h | 25 ++-
8 files changed, 535 insertions(+), 34 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h
--
2.7.4
From: Joonsoo Kim <[email protected]>
Following patch will implement deduplication functionality
in the zram and it requires an indirection layer to manage
the life cycle of zsmalloc handle. To prepare that, this patch
introduces zram_entry which can be used to manage the life-cycle
of zsmalloc handle. Many lines are changed due to rename but
core change is just simple introduction about newly data structure.
Signed-off-by: Joonsoo Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 82 +++++++++++++++++++++++++++++--------------
drivers/block/zram/zram_drv.h | 6 +++-
2 files changed, 60 insertions(+), 28 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0194441..f3949da 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -419,6 +419,32 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);
+static struct zram_entry *zram_entry_alloc(struct zram *zram,
+ unsigned int len, gfp_t flags)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry), flags);
+ if (!entry)
+ return NULL;
+
+ entry->handle = zs_malloc(meta->mem_pool, len, flags);
+ if (!entry->handle) {
+ kfree(entry);
+ return NULL;
+ }
+
+ return entry;
+}
+
+static inline void zram_entry_free(struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ zs_free(meta->mem_pool, entry->handle);
+ kfree(entry);
+}
+
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
@@ -426,15 +452,15 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
/* Free all pages that are still in this zram device */
for (index = 0; index < num_pages; index++) {
- unsigned long handle = meta->table[index].handle;
+ struct zram_entry *entry = meta->table[index].entry;
/*
* No memory is allocated for same element filled pages.
* Simply clear same page flag.
*/
- if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
+ if (!entry || zram_test_flag(meta, index, ZRAM_SAME))
continue;
- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);
}
zs_destroy_pool(meta->mem_pool);
@@ -479,7 +505,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
static void zram_free_page(struct zram *zram, size_t index)
{
struct zram_meta *meta = zram->meta;
- unsigned long handle = meta->table[index].handle;
+ struct zram_entry *entry = meta->table[index].entry;
/*
* No memory is allocated for same element filled pages.
@@ -492,16 +518,16 @@ static void zram_free_page(struct zram *zram, size_t index)
return;
}
- if (!handle)
+ if (!entry)
return;
- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);
atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);
- meta->table[index].handle = 0;
+ meta->table[index].entry = NULL;
zram_set_obj_size(meta, index, 0);
}
@@ -510,20 +536,20 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
int ret = 0;
unsigned char *cmem;
struct zram_meta *meta = zram->meta;
- unsigned long handle;
+ struct zram_entry *entry;
unsigned int size;
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- handle = meta->table[index].handle;
+ entry = meta->table[index].entry;
size = zram_get_obj_size(meta, index);
- if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) {
+ if (!entry || zram_test_flag(meta, index, ZRAM_SAME)) {
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
zram_fill_page(mem, PAGE_SIZE, meta->table[index].element);
return 0;
}
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
copy_page(mem, cmem);
} else {
@@ -532,7 +558,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
ret = zcomp_decompress(zstrm, cmem, size, mem);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, handle);
+ zs_unmap_object(meta->mem_pool, entry->handle);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
/* Should NEVER happen. Return bio error if it does. */
@@ -554,7 +580,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
page = bvec->bv_page;
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- if (unlikely(!meta->table[index].handle) ||
+ if (unlikely(!meta->table[index].entry) ||
zram_test_flag(meta, index, ZRAM_SAME)) {
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
handle_same_page(bvec, meta->table[index].element);
@@ -599,7 +625,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret = 0;
unsigned int clen;
- unsigned long handle = 0;
+ struct zram_entry *entry = NULL;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
@@ -670,34 +696,36 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}
/*
- * handle allocation has 2 paths:
+ * entry allocation has 2 paths:
* a) fast path is executed with preemption disabled (for
* per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
* since we can't sleep;
* b) slow path enables preemption and attempts to allocate
* the page with __GFP_DIRECT_RECLAIM bit set. we have to
* put per-cpu compression stream and, thus, to re-do
- * the compression once handle is allocated.
+ * the compression once entry is allocated.
*
- * if we have a 'non-null' handle here then we are coming
- * from the slow path and handle has already been allocated.
+ * if we have a 'non-null' entry here then we are coming
+ * from the slow path and entry has already been allocated.
*/
- if (!handle)
- handle = zs_malloc(meta->mem_pool, clen,
+ if (!entry) {
+ entry = zram_entry_alloc(zram, clen,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
- if (!handle) {
+ }
+
+ if (!entry) {
zcomp_stream_put(zram->comp);
zstrm = NULL;
atomic64_inc(&zram->stats.writestall);
- handle = zs_malloc(meta->mem_pool, clen,
+ entry = zram_entry_alloc(zram, clen,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
- if (handle)
+ if (entry)
goto compress_again;
pr_err("Error allocating memory for compressed page: %u, size=%u\n",
@@ -710,12 +738,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);
ret = -ENOMEM;
goto out;
}
- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_WO);
if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
src = kmap_atomic(page);
@@ -727,7 +755,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
zcomp_stream_put(zram->comp);
zstrm = NULL;
- zs_unmap_object(meta->mem_pool, handle);
+ zs_unmap_object(meta->mem_pool, entry->handle);
/*
* Free memory associated with this sector
@@ -736,7 +764,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
zram_free_page(zram, index);
- meta->table[index].handle = handle;
+ meta->table[index].entry = entry;
zram_set_obj_size(meta, index, clen);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index caeff51..a7ae46c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,10 +69,14 @@ enum zram_pageflags {
/*-- Data structures */
+struct zram_entry {
+ unsigned long handle;
+};
+
/* Allocated for each disk page */
struct zram_table_entry {
union {
- unsigned long handle;
+ struct zram_entry *entry;
unsigned long element;
};
unsigned long value;
--
2.7.4
From: Joonsoo Kim <[email protected]>
This patch implements deduplication feature in zram. The purpose
of this work is naturally to save amount of memory usage by zram.
Android is one of the biggest users to use zram as swap and it's
really important to save amount of memory usage. There is a paper
that reports that duplication ratio of Android's memory content is
rather high [1]. And, there is a similar work on zswap that also
reports that experiments has shown that around 10-15% of pages
stored in zswp are duplicates and deduplicate them provides some
benefits [2].
Also, there is a different kind of workload that uses zram as blockdev
and store build outputs into it to reduce wear-out problem of real
blockdev. In this workload, deduplication hit is very high due to
temporary files and intermediate object files. Detailed analysis is
on the bottom of this description.
Anyway, if we can detect duplicated content and avoid to store duplicated
content at different memory space, we can save memory. This patch
tries to do that.
Implementation is almost simple and intuitive but I should note
one thing about implementation detail.
To check duplication, this patch uses checksum of the page and
collision of this checksum could be possible. There would be
many choices to handle this situation but this patch chooses
to allow entry with duplicated checksum to be added to the hash,
but, not to compare all entries with duplicated checksum
when checking duplication. I guess that checksum collision is quite
rare event and we don't need to pay any attention to such a case.
Therefore, I decided the most simplest way to implement the feature.
If there is a different opinion, I can accept and go that way.
Following is the result of this patch.
Test result #1 (Swap):
Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
orig_data_size: 145297408
compr_data_size: 32408125
mem_used_total: 32276480
dup_data_size: 3188134
meta_data_size: 1444272
Last two metrics added to mm_stat are related to this work.
First one, dup_data_size, is amount of saved memory by avoiding
to store duplicated page. Later one, meta_data_size, is the amount of
data structure to support deduplication. If dup > meta, we can judge
that the patch improves memory usage.
In Adnroid, we can save 5% of memory usage by this work.
Test result #2 (Blockdev):
build the kernel and store output to ext4 FS on zram
<no-dedup>
Elapsed time: 249 s
mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
<dedup>
Elapsed time: 250 s
mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038 3945792
There is no performance degradation and save 23% memory.
Test result #3 (Blockdev):
copy android build output dir(out/host) to ext4 FS on zram
<no-dedup>
Elapsed time: out/host: 88 s
mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
<dedup>
Elapsed time: out/host: 100 s
mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
It shows performance degradation roughly 13% and save 24% memory. Maybe,
it is due to overhead of calculating checksum and comparison.
Test result #4 (Blockdev):
copy android build output dir(out/target/common) to ext4 FS on zram
<no-dedup>
Elapsed time: out/host: 203 s
mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
<dedup>
Elapsed time: out/host: 201 s
mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 24564336
Memory is saved by 42% and performance is the same. Even if there is overhead
of calculating checksum and comparison, large hit ratio compensate it since
hit leads to less compression attempt.
I checked the detailed reason of savings on kernel build workload and
there are some cases that deduplication happens.
1) *.cmd
Build command is usually similar in one directory so content of these file
are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
respectively.
2) intermediate object files
built-in.o and temporary object file have the similar contents. More than
50% of fs/ext4/ext4.o is the same with fs/ext4/built-in.o.
3) vmlinux
.tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boo/compressed/vmlinux.bin
have the similar contents.
Android test has similar case that some of object files(.class
and .so) are similar with another ones.
(./host/linux-x86/lib/libartd.so and
./host/linux-x86-lib/libartd-comiler.so)
Anyway, benefit seems to be largely dependent on the workload so
following patch will make this feature optional. However, this feature
can help some usecases so is deserved to be merged.
[1]: MemScope: Analyzing Memory Duplication on Android Systems,
dl.acm.org/citation.cfm?id=2797023
[2]: zswap: Optimize compressed pool memory utilization,
lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2
Signed-off-by: Joonsoo Kim <[email protected]>
---
Documentation/blockdev/zram.txt | 2 +
drivers/block/zram/Makefile | 2 +-
drivers/block/zram/zram_dedup.c | 222 ++++++++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 25 +++++
drivers/block/zram/zram_drv.c | 62 ++++++++---
drivers/block/zram/zram_drv.h | 18 ++++
6 files changed, 314 insertions(+), 17 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ dup_data_size deduplicated data size
+ meta_data_size the amount of metadata allocated for deduplication feature
9) Deactivate:
swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y := zcomp.o zram_drv.o
+zram-y := zcomp.o zram_drv.o zram_dedup.o
obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
new file mode 100644
index 0000000..d313fc8
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2017 Joonsoo Kim.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+
+#include "zram_drv.h"
+
+/* One slot will contain 128 pages theoretically */
+#define ZRAM_HASH_SHIFT 7
+#define ZRAM_HASH_SIZE_MIN (1 << 10)
+#define ZRAM_HASH_SIZE_MAX (1 << 31)
+
+u64 zram_dedup_dup_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.dup_data_size);
+}
+
+u64 zram_dedup_meta_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.meta_data_size);
+}
+
+static u32 zram_dedup_checksum(unsigned char *mem)
+{
+ return jhash(mem, PAGE_SIZE, 0);
+}
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+ rb_root = &hash->rb_root;
+
+ spin_lock(&hash->lock);
+ rb_node = &rb_root->rb_node;
+ while (*rb_node) {
+ parent = *rb_node;
+ entry = rb_entry(parent, struct zram_entry, rb_node);
+ if (checksum < entry->checksum)
+ rb_node = &parent->rb_left;
+ else if (checksum > entry->checksum)
+ rb_node = &parent->rb_right;
+ else
+ rb_node = &parent->rb_left;
+ }
+
+ rb_link_node(&new->rb_node, parent, rb_node);
+ rb_insert_color(&new->rb_node, rb_root);
+ spin_unlock(&hash->lock);
+}
+
+static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
+ unsigned char *mem)
+{
+ bool match = false;
+ unsigned char *cmem;
+ struct zram_meta *meta = zram->meta;
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(meta->mem_pool, entry->handle);
+
+ return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+ unsigned long refcount;
+
+ checksum = entry->checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ entry->refcount--;
+ refcount = entry->refcount;
+ if (!entry->refcount) {
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ RB_CLEAR_NODE(&entry->rb_node);
+ } else if (zram) {
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);
+ }
+ spin_unlock(&hash->lock);
+
+ return refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ zram_entry_free(zram, meta, entry);
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
+ u32 *checksum)
+{
+ *checksum = zram_dedup_checksum(mem);
+
+ return zram_dedup_get(zram, mem, *checksum);
+}
+
+struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len,
+ gfp_t flags)
+{
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry)
+ return NULL;
+
+ entry->handle = handle;
+ RB_CLEAR_NODE(&entry->rb_node);
+ entry->refcount = 1;
+ entry->len = len;
+
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
+ return entry;
+}
+
+unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ unsigned long handle;
+
+ if (zram_dedup_put(zram, meta, entry))
+ return 0;
+
+ handle = entry->handle;
+ kfree(entry);
+
+ /* !zram happens when reset/fail and updating stat is useless */
+ if (zram)
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
+
+ return handle;
+}
+
+int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
+{
+ int i;
+ struct zram_hash *hash;
+
+ meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+ meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
+ meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
+ meta->hash = vzalloc(meta->hash_size * sizeof(struct zram_hash));
+ if (!meta->hash) {
+ pr_err("Error allocating zram entry hash\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < meta->hash_size; i++) {
+ hash = &meta->hash[i];
+ spin_lock_init(&hash->lock);
+ hash->rb_root = RB_ROOT;
+ }
+
+ return 0;
+}
+
+void zram_dedup_fini(struct zram_meta *meta)
+{
+ vfree(meta->hash);
+}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
new file mode 100644
index 0000000..7071f32
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.h
@@ -0,0 +1,25 @@
+#ifndef _ZRAM_DEDUP_H_
+#define _ZRAM_DEDUP_H_
+
+struct zram;
+struct zram_meta;
+struct zram_entry;
+
+u64 zram_dedup_dup_size(struct zram *zram);
+u64 zram_dedup_meta_size(struct zram *zram);
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum);
+struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
+ u32 *checksum);
+
+struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len,
+ gfp_t flags);
+unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry);
+
+int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
+void zram_dedup_fini(struct zram_meta *meta);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f3949da..15cecd6 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -385,14 +385,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ zram_dedup_dup_size(zram),
+ zram_dedup_meta_size(zram));
up_read(&zram->init_lock);
return ret;
@@ -424,25 +426,29 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
{
struct zram_meta *meta = zram->meta;
struct zram_entry *entry;
+ unsigned long handle;
- entry = kzalloc(sizeof(*entry), flags);
- if (!entry)
+ handle = zs_malloc(meta->mem_pool, len, flags);
+ if (!handle)
return NULL;
- entry->handle = zs_malloc(meta->mem_pool, len, flags);
- if (!entry->handle) {
- kfree(entry);
+ entry = zram_dedup_alloc(zram, handle, len, flags);
+ if (!entry) {
+ zs_free(meta->mem_pool, handle);
return NULL;
}
return entry;
}
-static inline void zram_entry_free(struct zram_meta *meta,
- struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
{
- zs_free(meta->mem_pool, entry->handle);
- kfree(entry);
+ unsigned long handle;
+
+ handle = zram_dedup_free(zram, meta, entry);
+ if (handle)
+ zs_free(meta->mem_pool, handle);
}
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
@@ -460,10 +466,11 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
if (!entry || zram_test_flag(meta, index, ZRAM_SAME))
continue;
- zram_entry_free(meta, entry);
+ zram_entry_free(NULL, meta, entry);
}
zs_destroy_pool(meta->mem_pool);
+ zram_dedup_fini(meta);
vfree(meta->table);
kfree(meta);
}
@@ -471,7 +478,7 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
{
size_t num_pages;
- struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+ struct zram_meta *meta = kzalloc(sizeof(*meta), GFP_KERNEL);
if (!meta)
return NULL;
@@ -483,6 +490,11 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}
+ if (zram_dedup_init(meta, num_pages)) {
+ pr_err("Error initializing zram entry hash\n");
+ goto out_error;
+ }
+
meta->mem_pool = zs_create_pool(pool_name);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
@@ -492,6 +504,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
return meta;
out_error:
+ zram_dedup_fini(meta);
vfree(meta->table);
kfree(meta);
return NULL;
@@ -521,7 +534,7 @@ static void zram_free_page(struct zram *zram, size_t index)
if (!entry)
return;
- zram_entry_free(meta, entry);
+ zram_entry_free(zram, meta, entry);
atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
@@ -625,13 +638,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret = 0;
unsigned int clen;
- struct zram_entry *entry = NULL;
+ struct zram_entry *entry = NULL, *found_entry;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
struct zcomp_strm *zstrm = NULL;
unsigned long alloced_pages;
unsigned long element;
+ u32 checksum;
page = bvec->bv_page;
if (is_partial_io(bvec)) {
@@ -675,6 +689,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}
+ found_entry = zram_dedup_find(zram, uncmem, &checksum);
+ if (found_entry) {
+ if (!is_partial_io(bvec))
+ kunmap_atomic(user_mem);
+
+ if (entry)
+ zram_entry_free(zram, meta, entry);
+
+ entry = found_entry;
+ clen = entry->len;
+
+ goto found_dup;
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zcomp_compress(zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
@@ -738,7 +766,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
update_used_max(zram, alloced_pages);
if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zram_entry_free(meta, entry);
+ zram_entry_free(zram, meta, entry);
ret = -ENOMEM;
goto out;
}
@@ -756,7 +784,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
zcomp_stream_put(zram->comp);
zstrm = NULL;
zs_unmap_object(meta->mem_pool, entry->handle);
+ zram_dedup_insert(zram, entry, checksum);
+found_dup:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a7ae46c..968e269 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>
#include "zcomp.h"
+#include "zram_dedup.h"
/*-- Configurable parameters */
@@ -70,6 +72,10 @@ enum zram_pageflags {
/*-- Data structures */
struct zram_entry {
+ struct rb_node rb_node;
+ u32 len;
+ u32 checksum;
+ unsigned long refcount;
unsigned long handle;
};
@@ -94,11 +100,20 @@ struct zram_stats {
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
+ atomic64_t dup_data_size; /* compressed size of pages duplicated */
+ atomic64_t meta_data_size; /* size of zram_entries */
+};
+
+struct zram_hash {
+ spinlock_t lock;
+ struct rb_root rb_root;
};
struct zram_meta {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
+ struct zram_hash *hash;
+ size_t hash_size;
};
struct zram {
@@ -124,4 +139,7 @@ struct zram {
*/
bool claim; /* Protected by bdev->bd_mutex */
};
+
+void zram_entry_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry);
#endif
--
2.7.4
From: Joonsoo Kim <[email protected]>
Until now, we compare just one entry with same checksum when
checking duplication since it is the simplest way to implement.
However, for the completeness, checking all the entries is better
so this patch implement to compare all the entries with same checksum.
Since this event would be rare so there would be no performance loss.
Signed-off-by: Joonsoo Kim <[email protected]>
---
drivers/block/zram/zram_dedup.c | 59 +++++++++++++++++++++++++++++++++--------
1 file changed, 48 insertions(+), 11 deletions(-)
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 1df1ce1..c4dfd21 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -125,6 +125,51 @@ static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
return refcount;
}
+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+ struct zram_hash *hash, unsigned char *mem,
+ struct zram_entry *entry)
+{
+ struct zram_entry *tmp, *prev = NULL;
+ struct rb_node *rb_node;
+
+ /* find left-most entry with same checksum */
+ while ((rb_node = rb_prev(&entry->rb_node))) {
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (tmp->checksum != entry->checksum)
+ break;
+
+ entry = tmp;
+ }
+
+again:
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (prev)
+ zram_entry_free(zram, zram->meta, prev);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ spin_lock(&hash->lock);
+ tmp = NULL;
+ rb_node = rb_next(&entry->rb_node);
+ if (rb_node)
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+
+ if (tmp && (tmp->checksum == entry->checksum)) {
+ prev = entry;
+ entry = tmp;
+ goto again;
+ }
+
+ spin_unlock(&hash->lock);
+ zram_entry_free(zram, zram->meta, entry);
+
+ return NULL;
+}
+
static struct zram_entry *zram_dedup_get(struct zram *zram,
unsigned char *mem, u32 checksum)
{
@@ -139,18 +184,10 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
rb_node = hash->rb_root.rb_node;
while (rb_node) {
entry = rb_entry(rb_node, struct zram_entry, rb_node);
- if (checksum == entry->checksum) {
- entry->refcount++;
- atomic64_add(entry->len, &zram->stats.dup_data_size);
- spin_unlock(&hash->lock);
-
- if (zram_dedup_match(zram, entry, mem))
- return entry;
-
- zram_entry_free(zram, meta, entry);
- return NULL;
- }
+ /* lock will be released in the following function */
+ if (checksum == entry->checksum)
+ return __zram_dedup_get(zram, hash, mem, entry);
if (checksum < entry->checksum)
rb_node = rb_node->rb_left;
--
2.7.4
From: Joonsoo Kim <[email protected]>
Benefit of deduplication is dependent on the workload so it's not
preferable to always enable. Therefore, make it optional in Kconfig
and device param. Default is 'off'. This option will be beneficial
for users who use the zram as blockdev and stores build output to it.
Signed-off-by: Joonsoo Kim <[email protected]>
---
Documentation/ABI/testing/sysfs-block-zram | 10 +++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 +++++++
drivers/block/zram/Makefile | 5 ++-
drivers/block/zram/zram_dedup.c | 31 ++++++++++++++-
drivers/block/zram/zram_dedup.h | 33 +++++++++++++++-
drivers/block/zram/zram_drv.c | 62 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 1 +
8 files changed, 146 insertions(+), 11 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d8..3c1945f 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,13 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
+
+What: /sys/block/zram<id>/use_dedup
+Date: March 2017
+Contact: Joonsoo Kim <[email protected]>
+Description:
+ The use_dedup file is read-write and specifies deduplication
+ feature is used or not. If enabled, duplicated data is
+ managed by reference count and will not be stored in memory
+ twice. Benefit of this feature largely depends on the workload
+ so keep attention when use.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 2cdc303..cbbe39b 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
+use_dedup RW show and set deduplication feature
User space is advised to use the following files to read the device statistics.
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6..2f3dd1f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,17 @@ config ZRAM
disks and maybe many more.
See zram.txt for more information.
+
+config ZRAM_DEDUP
+ bool "Deduplication support for ZRAM data"
+ depends on ZRAM
+ default n
+ help
+ Deduplicate ZRAM data to reduce amount of memory consumption.
+ Advantage largely depends on the workload. In some cases, this
+ option reduces memory usage to the half. However, if there is no
+ duplicated data, the amount of memory consumption would be
+ increased due to additional metadata usage. And, there is
+ computation time trade-off. Please check the benefit before
+ enabling this option. Experiment shows the positive effect when
+ the zram is used as blockdev and is used to store build output.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 29cb008..1f6fecd 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y := zcomp.o zram_drv.o zram_dedup.o
+zram-y := zcomp.o zram_drv.o
-obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index d313fc8..1df1ce1 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -27,6 +27,19 @@ u64 zram_dedup_meta_size(struct zram *zram)
return (u64)atomic64_read(&zram->stats.meta_data_size);
}
+static inline bool zram_dedup_enabled(struct zram_meta *meta)
+{
+ return meta->hash;
+}
+
+unsigned long zram_dedup_handle(struct zram *zram, struct zram_entry *entry)
+{
+ if (!zram_dedup_enabled(zram->meta))
+ return (unsigned long)entry;
+
+ return entry->handle;
+}
+
static u32 zram_dedup_checksum(unsigned char *mem)
{
return jhash(mem, PAGE_SIZE, 0);
@@ -41,6 +54,9 @@ void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
struct rb_node **rb_node, *parent = NULL;
struct zram_entry *entry;
+ if (!zram_dedup_enabled(zram->meta))
+ return;
+
new->checksum = checksum;
hash = &meta->hash[checksum % meta->hash_size];
rb_root = &hash->rb_root;
@@ -149,6 +165,9 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
u32 *checksum)
{
+ if (!zram_dedup_enabled(zram->meta))
+ return NULL;
+
*checksum = zram_dedup_checksum(mem);
return zram_dedup_get(zram, mem, *checksum);
@@ -160,6 +179,9 @@ struct zram_entry *zram_dedup_alloc(struct zram *zram,
{
struct zram_entry *entry;
+ if (!zram_dedup_enabled(zram->meta))
+ return (struct zram_entry *)handle;
+
entry = kzalloc(sizeof(*entry),
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
if (!entry)
@@ -180,6 +202,9 @@ unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
{
unsigned long handle;
+ if (!zram_dedup_enabled(meta))
+ return (unsigned long)entry;
+
if (zram_dedup_put(zram, meta, entry))
return 0;
@@ -193,11 +218,14 @@ unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
return handle;
}
-int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
+int zram_dedup_init(struct zram *zram, struct zram_meta *meta, size_t num_pages)
{
int i;
struct zram_hash *hash;
+ if (!zram->use_dedup)
+ return 0;
+
meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
@@ -219,4 +247,5 @@ int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
void zram_dedup_fini(struct zram_meta *meta)
{
vfree(meta->hash);
+ meta->hash = NULL;
}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
index 7071f32..470009a 100644
--- a/drivers/block/zram/zram_dedup.h
+++ b/drivers/block/zram/zram_dedup.h
@@ -5,9 +5,11 @@ struct zram;
struct zram_meta;
struct zram_entry;
+#ifdef CONFIG_ZRAM_DEDUP
u64 zram_dedup_dup_size(struct zram *zram);
u64 zram_dedup_meta_size(struct zram *zram);
+unsigned long zram_dedup_handle(struct zram *zram, struct zram_entry *entry);
void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
u32 checksum);
struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
@@ -19,7 +21,36 @@ struct zram_entry *zram_dedup_alloc(struct zram *zram,
unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
struct zram_entry *entry);
-int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
+int zram_dedup_init(struct zram *zram, struct zram_meta *meta,
+ size_t num_pages);
void zram_dedup_fini(struct zram_meta *meta);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline unsigned long zram_dedup_handle(struct zram *zram,
+ struct zram_entry *entry) { return (unsigned long)entry; }
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+ unsigned char *mem, u32 *checksum) { return NULL; }
+
+static inline struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len, gfp_t flags)
+{
+ return (struct zram_entry *)handle;
+}
+static inline unsigned long zram_dedup_free(struct zram *zram,
+ struct zram_meta *meta, struct zram_entry *entry)
+{
+ return (unsigned long)entry;
+}
+
+static inline int zram_dedup_init(struct zram *zram, struct zram_meta *meta,
+ size_t num_pages) { return 0; }
+void zram_dedup_fini(struct zram_meta *meta) { }
+
+#endif
#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 15cecd6..016b2ee 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -327,6 +327,43 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}
+static ssize_t use_dedup_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ bool val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->use_dedup;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val);
+}
+
+static ssize_t use_dedup_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change dedup usage for initialized device\n");
+ return -EBUSY;
+ }
+ zram->use_dedup = val;
+ up_write(&zram->init_lock);
+ return len;
+#else
+ return -EINVAL;
+#endif
+}
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -421,6 +458,12 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);
+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
+{
+ return zram_dedup_handle(zram, entry);
+}
+
static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
@@ -475,9 +518,10 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
kfree(meta);
}
-static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+static struct zram_meta *zram_meta_alloc(struct zram *zram, u64 disksize)
{
size_t num_pages;
+ char *pool_name = zram->disk->disk_name;
struct zram_meta *meta = kzalloc(sizeof(*meta), GFP_KERNEL);
if (!meta)
@@ -490,7 +534,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}
- if (zram_dedup_init(meta, num_pages)) {
+ if (zram_dedup_init(zram, meta, num_pages)) {
pr_err("Error initializing zram entry hash\n");
goto out_error;
}
@@ -562,7 +606,8 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
return 0;
}
- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (size == PAGE_SIZE) {
copy_page(mem, cmem);
} else {
@@ -571,7 +616,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
ret = zcomp_decompress(zstrm, cmem, size, mem);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
/* Should NEVER happen. Return bio error if it does. */
@@ -771,7 +816,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}
- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_WO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_WO);
if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
src = kmap_atomic(page);
@@ -783,7 +829,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
zcomp_stream_put(zram->comp);
zstrm = NULL;
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
zram_dedup_insert(zram, entry, checksum);
found_dup:
@@ -1055,7 +1101,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;
disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->disk_name, disksize);
+ meta = zram_meta_alloc(zram, disksize);
if (!meta)
return -ENOMEM;
@@ -1166,6 +1212,7 @@ static DEVICE_ATTR_WO(mem_limit);
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+static DEVICE_ATTR_RW(use_dedup);
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1176,6 +1223,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 968e269..53be62a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -138,6 +138,7 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};
void zram_entry_free(struct zram *zram, struct zram_meta *meta,
--
2.7.4
On Thu, 30 Mar 2017 14:38:05 +0900 [email protected] wrote:
> This patchset implements deduplication feature in zram. Motivation
> is to save memory usage by zram. There are detailed description
> about motivation and experimental results on patch #2 so please
> refer it.
I'm getting a lot of rejects against the current -mm tree. So this is
one of those patchsets which should be prepared against -mm, please.
I'll be attempting to release another mmotm in the next couple of hours.
Hi Andrew and Joonsoo,
On Thu, Mar 30, 2017 at 03:25:02PM -0700, Andrew Morton wrote:
> On Thu, 30 Mar 2017 14:38:05 +0900 [email protected] wrote:
>
> > This patchset implements deduplication feature in zram. Motivation
> > is to save memory usage by zram. There are detailed description
> > about motivation and experimental results on patch #2 so please
> > refer it.
>
> I'm getting a lot of rejects against the current -mm tree. So this is
> one of those patchsets which should be prepared against -mm, please.
Due to my partial IO refactoring. Sorry for the inconvenience.
It's still on-going and I want to merge it before new feature like
dedup.
Joonsoo,
I will send partial-IO rework formal patch next week and start to
review your patchset so I hope you resend your patchset based on it
with my comments.
Thanks.
On Fri, Mar 31, 2017 at 08:40:56AM +0900, Minchan Kim wrote:
> Hi Andrew and Joonsoo,
>
> On Thu, Mar 30, 2017 at 03:25:02PM -0700, Andrew Morton wrote:
> > On Thu, 30 Mar 2017 14:38:05 +0900 [email protected] wrote:
> >
> > > This patchset implements deduplication feature in zram. Motivation
> > > is to save memory usage by zram. There are detailed description
> > > about motivation and experimental results on patch #2 so please
> > > refer it.
> >
> > I'm getting a lot of rejects against the current -mm tree. So this is
> > one of those patchsets which should be prepared against -mm, please.
>
> Due to my partial IO refactoring. Sorry for the inconvenience.
> It's still on-going and I want to merge it before new feature like
> dedup.
>
> Joonsoo,
>
> I will send partial-IO rework formal patch next week and start to
> review your patchset so I hope you resend your patchset based on it
> with my comments.
No problem. I will wait. :)
Thanks.
Hi Joonsoo,
I reviewed this patch and overall, looks great! Thanks.
However, as you know, recently, zram had lots of clean up so
this patchset should be rebased on it massively.
Sorry for the inconvenience.
And there are some minor in below. I hope you handle them in
next submit, please.
On Thu, Mar 30, 2017 at 02:38:07PM +0900, [email protected] wrote:
> From: Joonsoo Kim <[email protected]>
>
> This patch implements deduplication feature in zram. The purpose
> of this work is naturally to save amount of memory usage by zram.
>
> Android is one of the biggest users to use zram as swap and it's
> really important to save amount of memory usage. There is a paper
> that reports that duplication ratio of Android's memory content is
> rather high [1]. And, there is a similar work on zswap that also
> reports that experiments has shown that around 10-15% of pages
> stored in zswp are duplicates and deduplicate them provides some
> benefits [2].
>
> Also, there is a different kind of workload that uses zram as blockdev
> and store build outputs into it to reduce wear-out problem of real
> blockdev. In this workload, deduplication hit is very high due to
> temporary files and intermediate object files. Detailed analysis is
> on the bottom of this description.
>
> Anyway, if we can detect duplicated content and avoid to store duplicated
> content at different memory space, we can save memory. This patch
> tries to do that.
>
> Implementation is almost simple and intuitive but I should note
> one thing about implementation detail.
>
> To check duplication, this patch uses checksum of the page and
> collision of this checksum could be possible. There would be
> many choices to handle this situation but this patch chooses
> to allow entry with duplicated checksum to be added to the hash,
> but, not to compare all entries with duplicated checksum
> when checking duplication. I guess that checksum collision is quite
> rare event and we don't need to pay any attention to such a case.
> Therefore, I decided the most simplest way to implement the feature.
> If there is a different opinion, I can accept and go that way.
>
> Following is the result of this patch.
>
> Test result #1 (Swap):
> Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
>
> orig_data_size: 145297408
> compr_data_size: 32408125
> mem_used_total: 32276480
> dup_data_size: 3188134
> meta_data_size: 1444272
>
> Last two metrics added to mm_stat are related to this work.
> First one, dup_data_size, is amount of saved memory by avoiding
> to store duplicated page. Later one, meta_data_size, is the amount of
> data structure to support deduplication. If dup > meta, we can judge
> that the patch improves memory usage.
>
> In Adnroid, we can save 5% of memory usage by this work.
>
> Test result #2 (Blockdev):
> build the kernel and store output to ext4 FS on zram
>
> <no-dedup>
> Elapsed time: 249 s
> mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
>
> <dedup>
> Elapsed time: 250 s
> mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038 3945792
>
> There is no performance degradation and save 23% memory.
>
> Test result #3 (Blockdev):
> copy android build output dir(out/host) to ext4 FS on zram
>
> <no-dedup>
> Elapsed time: out/host: 88 s
> mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
>
> <dedup>
> Elapsed time: out/host: 100 s
> mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
>
> It shows performance degradation roughly 13% and save 24% memory. Maybe,
> it is due to overhead of calculating checksum and comparison.
>
> Test result #4 (Blockdev):
> copy android build output dir(out/target/common) to ext4 FS on zram
>
> <no-dedup>
> Elapsed time: out/host: 203 s
> mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
>
> <dedup>
> Elapsed time: out/host: 201 s
> mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 24564336
>
> Memory is saved by 42% and performance is the same. Even if there is overhead
> of calculating checksum and comparison, large hit ratio compensate it since
> hit leads to less compression attempt.
>
> I checked the detailed reason of savings on kernel build workload and
> there are some cases that deduplication happens.
>
> 1) *.cmd
> Build command is usually similar in one directory so content of these file
> are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> respectively.
>
> 2) intermediate object files
> built-in.o and temporary object file have the similar contents. More than
> 50% of fs/ext4/ext4.o is the same with fs/ext4/built-in.o.
>
> 3) vmlinux
> .tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boo/compressed/vmlinux.bin
> have the similar contents.
>
> Android test has similar case that some of object files(.class
> and .so) are similar with another ones.
> (./host/linux-x86/lib/libartd.so and
> ./host/linux-x86-lib/libartd-comiler.so)
>
> Anyway, benefit seems to be largely dependent on the workload so
> following patch will make this feature optional. However, this feature
> can help some usecases so is deserved to be merged.
>
> [1]: MemScope: Analyzing Memory Duplication on Android Systems,
> dl.acm.org/citation.cfm?id=2797023
> [2]: zswap: Optimize compressed pool memory utilization,
> lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2
>
> Signed-off-by: Joonsoo Kim <[email protected]>
> ---
> Documentation/blockdev/zram.txt | 2 +
> drivers/block/zram/Makefile | 2 +-
> drivers/block/zram/zram_dedup.c | 222 ++++++++++++++++++++++++++++++++++++++++
> drivers/block/zram/zram_dedup.h | 25 +++++
> drivers/block/zram/zram_drv.c | 62 ++++++++---
> drivers/block/zram/zram_drv.h | 18 ++++
> 6 files changed, 314 insertions(+), 17 deletions(-)
> create mode 100644 drivers/block/zram/zram_dedup.c
> create mode 100644 drivers/block/zram/zram_dedup.h
>
> diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
> index 4fced8a..2cdc303 100644
> --- a/Documentation/blockdev/zram.txt
> +++ b/Documentation/blockdev/zram.txt
> @@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
> same_pages the number of same element filled pages written to this disk.
> No memory is allocated for such pages.
> pages_compacted the number of pages freed during compaction
> + dup_data_size deduplicated data size
> + meta_data_size the amount of metadata allocated for deduplication feature
>
> 9) Deactivate:
> swapoff /dev/zram0
> diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
> index 9e2b79e..29cb008 100644
> --- a/drivers/block/zram/Makefile
> +++ b/drivers/block/zram/Makefile
> @@ -1,3 +1,3 @@
> -zram-y := zcomp.o zram_drv.o
> +zram-y := zcomp.o zram_drv.o zram_dedup.o
>
> obj-$(CONFIG_ZRAM) += zram.o
> diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
> new file mode 100644
> index 0000000..d313fc8
> --- /dev/null
> +++ b/drivers/block/zram/zram_dedup.c
> @@ -0,0 +1,222 @@
> +/*
> + * Copyright (C) 2017 Joonsoo Kim.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#include <linux/vmalloc.h>
> +#include <linux/jhash.h>
> +
> +#include "zram_drv.h"
> +
> +/* One slot will contain 128 pages theoretically */
> +#define ZRAM_HASH_SHIFT 7
> +#define ZRAM_HASH_SIZE_MIN (1 << 10)
> +#define ZRAM_HASH_SIZE_MAX (1 << 31)
> +
> +u64 zram_dedup_dup_size(struct zram *zram)
> +{
> + return (u64)atomic64_read(&zram->stats.dup_data_size);
> +}
> +
> +u64 zram_dedup_meta_size(struct zram *zram)
> +{
> + return (u64)atomic64_read(&zram->stats.meta_data_size);
> +}
> +
> +static u32 zram_dedup_checksum(unsigned char *mem)
> +{
> + return jhash(mem, PAGE_SIZE, 0);
> +}
> +
> +void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
> + u32 checksum)
> +{
> + struct zram_meta *meta = zram->meta;
> + struct zram_hash *hash;
> + struct rb_root *rb_root;
> + struct rb_node **rb_node, *parent = NULL;
> + struct zram_entry *entry;
> +
> + new->checksum = checksum;
> + hash = &meta->hash[checksum % meta->hash_size];
> + rb_root = &hash->rb_root;
> +
> + spin_lock(&hash->lock);
> + rb_node = &rb_root->rb_node;
> + while (*rb_node) {
> + parent = *rb_node;
> + entry = rb_entry(parent, struct zram_entry, rb_node);
> + if (checksum < entry->checksum)
> + rb_node = &parent->rb_left;
> + else if (checksum > entry->checksum)
> + rb_node = &parent->rb_right;
> + else
> + rb_node = &parent->rb_left;
> + }
> +
> + rb_link_node(&new->rb_node, parent, rb_node);
> + rb_insert_color(&new->rb_node, rb_root);
> + spin_unlock(&hash->lock);
> +}
> +
> +static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
> + unsigned char *mem)
> +{
> + bool match = false;
> + unsigned char *cmem;
> + struct zram_meta *meta = zram->meta;
> + struct zcomp_strm *zstrm;
> +
> + cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
> + if (entry->len == PAGE_SIZE) {
> + match = !memcmp(mem, cmem, PAGE_SIZE);
> + } else {
> + zstrm = zcomp_stream_get(zram->comp);
> + if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
> + match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
> + zcomp_stream_put(zram->comp);
> + }
> + zs_unmap_object(meta->mem_pool, entry->handle);
> +
> + return match;
> +}
> +
> +static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry)
> +{
> + struct zram_hash *hash;
> + u32 checksum;
> + unsigned long refcount;
> +
> + checksum = entry->checksum;
> + hash = &meta->hash[checksum % meta->hash_size];
> +
> + spin_lock(&hash->lock);
> + entry->refcount--;
> + refcount = entry->refcount;
> + if (!entry->refcount) {
> + rb_erase(&entry->rb_node, &hash->rb_root);
> + RB_CLEAR_NODE(&entry->rb_node);
What's the purpose of this RB_CLEAR_NODE?
I think we don't need it.
> + } else if (zram) {
> + atomic64_sub(entry->len, &zram->stats.dup_data_size);
> + }
> + spin_unlock(&hash->lock);
> +
> + return refcount;
> +}
> +
> +static struct zram_entry *zram_dedup_get(struct zram *zram,
> + unsigned char *mem, u32 checksum)
> +{
> + struct zram_meta *meta = zram->meta;
> + struct zram_hash *hash;
> + struct zram_entry *entry;
> + struct rb_node *rb_node;
> +
> + hash = &meta->hash[checksum % meta->hash_size];
> +
> + spin_lock(&hash->lock);
> + rb_node = hash->rb_root.rb_node;
> + while (rb_node) {
> + entry = rb_entry(rb_node, struct zram_entry, rb_node);
> + if (checksum == entry->checksum) {
> + entry->refcount++;
> + atomic64_add(entry->len, &zram->stats.dup_data_size);
> + spin_unlock(&hash->lock);
> +
> + if (zram_dedup_match(zram, entry, mem))
> + return entry;
> +
> + zram_entry_free(zram, meta, entry);
> +
> + return NULL;
> + }
> +
> + if (checksum < entry->checksum)
> + rb_node = rb_node->rb_left;
> + else
> + rb_node = rb_node->rb_right;
> + }
> + spin_unlock(&hash->lock);
> +
> + return NULL;
> +}
> +
> +struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
> + u32 *checksum)
> +{
> + *checksum = zram_dedup_checksum(mem);
> +
> + return zram_dedup_get(zram, mem, *checksum);
> +}
> +
> +struct zram_entry *zram_dedup_alloc(struct zram *zram,
> + unsigned long handle, unsigned int len,
> + gfp_t flags)
> +{
> + struct zram_entry *entry;
> +
> + entry = kzalloc(sizeof(*entry),
> + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
zram_entry_alloc can mask off GFP_HIGHMEM|__GFP_MOVABLE and
it can pass filtered gfp flags to zs_malloc and zram_dedup_alloc.
> + if (!entry)
> + return NULL;
> +
> + entry->handle = handle;
> + RB_CLEAR_NODE(&entry->rb_node);
Ditto. I think we don't need to clear rb_node.
> + entry->refcount = 1;
> + entry->len = len;
> +
> + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
> +
> + return entry;
> +}
> +
> +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry)
> +{
> + unsigned long handle;
> +
> + if (zram_dedup_put(zram, meta, entry))
> + return 0;
> +
> + handle = entry->handle;
> + kfree(entry);
> +
> + /* !zram happens when reset/fail and updating stat is useless */
With recent version, it's not true so your code would be more clean. :)
> + if (zram)
> + atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
> +
> + return handle;
> +}
> +
> +int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
> +{
> + int i;
> + struct zram_hash *hash;
> +
> + meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
> + meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
> + meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
> + meta->hash = vzalloc(meta->hash_size * sizeof(struct zram_hash));
> + if (!meta->hash) {
> + pr_err("Error allocating zram entry hash\n");
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < meta->hash_size; i++) {
> + hash = &meta->hash[i];
> + spin_lock_init(&hash->lock);
> + hash->rb_root = RB_ROOT;
> + }
> +
> + return 0;
> +}
> +
> +void zram_dedup_fini(struct zram_meta *meta)
> +{
> + vfree(meta->hash);
> +}
> diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
> new file mode 100644
> index 0000000..7071f32
> --- /dev/null
> +++ b/drivers/block/zram/zram_dedup.h
> @@ -0,0 +1,25 @@
> +#ifndef _ZRAM_DEDUP_H_
> +#define _ZRAM_DEDUP_H_
> +
> +struct zram;
> +struct zram_meta;
> +struct zram_entry;
> +
> +u64 zram_dedup_dup_size(struct zram *zram);
> +u64 zram_dedup_meta_size(struct zram *zram);
> +
> +void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
> + u32 checksum);
> +struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
> + u32 *checksum);
> +
> +struct zram_entry *zram_dedup_alloc(struct zram *zram,
> + unsigned long handle, unsigned int len,
> + gfp_t flags);
> +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry);
> +
> +int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
> +void zram_dedup_fini(struct zram_meta *meta);
> +
> +#endif /* _ZRAM_DEDUP_H_ */
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index f3949da..15cecd6 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -385,14 +385,16 @@ static ssize_t mm_stat_show(struct device *dev,
> max_used = atomic_long_read(&zram->stats.max_used_pages);
>
> ret = scnprintf(buf, PAGE_SIZE,
> - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> orig_size << PAGE_SHIFT,
> (u64)atomic64_read(&zram->stats.compr_data_size),
> mem_used << PAGE_SHIFT,
> zram->limit_pages << PAGE_SHIFT,
> max_used << PAGE_SHIFT,
> (u64)atomic64_read(&zram->stats.same_pages),
> - pool_stats.pages_compacted);
> + pool_stats.pages_compacted,
> + zram_dedup_dup_size(zram),
> + zram_dedup_meta_size(zram));
> up_read(&zram->init_lock);
>
> return ret;
> @@ -424,25 +426,29 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
> {
> struct zram_meta *meta = zram->meta;
> struct zram_entry *entry;
> + unsigned long handle;
>
> - entry = kzalloc(sizeof(*entry), flags);
> - if (!entry)
> + handle = zs_malloc(meta->mem_pool, len, flags);
> + if (!handle)
> return NULL;
>
> - entry->handle = zs_malloc(meta->mem_pool, len, flags);
> - if (!entry->handle) {
> - kfree(entry);
> + entry = zram_dedup_alloc(zram, handle, len, flags);
I don't like to hide zram_entry allocation behind zram_dedup_alloc.
It's never wrong because zram_table allocation happens on only
with dedup feature but zram_table is owned by zram_drv so I want
for zram_drv to control zram_table alloc/free.
How about this?
zram_entry_alloc
unsigned long handle;
struct zram_entry *entry;
handle = zs_malloc;
if (zram_dedup_enabled())
return handle;
entry = kzalloc();
zram_dedup_init(entry, handle, len);
return entry;
> + if (!entry) {
> + zs_free(meta->mem_pool, handle);
> return NULL;
> }
>
> return entry;
> }
>
> -static inline void zram_entry_free(struct zram_meta *meta,
> - struct zram_entry *entry)
> +void zram_entry_free(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry)
> {
> - zs_free(meta->mem_pool, entry->handle);
> - kfree(entry);
> + unsigned long handle;
> +
> + handle = zram_dedup_free(zram, meta, entry);
Ditto.
unsigned long handle = entry->handle;
if (zram_dedup_put(zram, meta, entry))
kfree(entry);
zs_free(meta->mem_pool, handle);
With this, In [4/4], I want that zram_entry_handle
doesn't rely on zram_dedup_handle.
unsigned long zram_entry_handle(strucdt zram *zram, strucdt zram_entry *entry)
{
unsigned long handle;
if (!zram_dedup_enabled(zram->meta))
handle = (unsigned long)entry;
else
handle = entry->handle;
return handle;
}
> + if (handle)
> + zs_free(meta->mem_pool, handle);
> }
On Mon, Apr 17, 2017 at 10:38:16AM +0900, Minchan Kim wrote:
> Hi Joonsoo,
>
> I reviewed this patch and overall, looks great! Thanks.
Thanks!
> However, as you know, recently, zram had lots of clean up so
> this patchset should be rebased on it massively.
> Sorry for the inconvenience.
No problem.
>
> And there are some minor in below. I hope you handle them in
> next submit, please.
Okay.
> On Thu, Mar 30, 2017 at 02:38:07PM +0900, [email protected] wrote:
> > From: Joonsoo Kim <[email protected]>
> >
> > This patch implements deduplication feature in zram. The purpose
> > of this work is naturally to save amount of memory usage by zram.
> >
> > Android is one of the biggest users to use zram as swap and it's
> > really important to save amount of memory usage. There is a paper
> > that reports that duplication ratio of Android's memory content is
> > rather high [1]. And, there is a similar work on zswap that also
> > reports that experiments has shown that around 10-15% of pages
> > stored in zswp are duplicates and deduplicate them provides some
> > benefits [2].
> >
> > Also, there is a different kind of workload that uses zram as blockdev
> > and store build outputs into it to reduce wear-out problem of real
> > blockdev. In this workload, deduplication hit is very high due to
> > temporary files and intermediate object files. Detailed analysis is
> > on the bottom of this description.
> >
> > Anyway, if we can detect duplicated content and avoid to store duplicated
> > content at different memory space, we can save memory. This patch
> > tries to do that.
> >
> > Implementation is almost simple and intuitive but I should note
> > one thing about implementation detail.
> >
> > To check duplication, this patch uses checksum of the page and
> > collision of this checksum could be possible. There would be
> > many choices to handle this situation but this patch chooses
> > to allow entry with duplicated checksum to be added to the hash,
> > but, not to compare all entries with duplicated checksum
> > when checking duplication. I guess that checksum collision is quite
> > rare event and we don't need to pay any attention to such a case.
> > Therefore, I decided the most simplest way to implement the feature.
> > If there is a different opinion, I can accept and go that way.
> >
> > Following is the result of this patch.
> >
> > Test result #1 (Swap):
> > Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18
> >
> > orig_data_size: 145297408
> > compr_data_size: 32408125
> > mem_used_total: 32276480
> > dup_data_size: 3188134
> > meta_data_size: 1444272
> >
> > Last two metrics added to mm_stat are related to this work.
> > First one, dup_data_size, is amount of saved memory by avoiding
> > to store duplicated page. Later one, meta_data_size, is the amount of
> > data structure to support deduplication. If dup > meta, we can judge
> > that the patch improves memory usage.
> >
> > In Adnroid, we can save 5% of memory usage by this work.
> >
> > Test result #2 (Blockdev):
> > build the kernel and store output to ext4 FS on zram
> >
> > <no-dedup>
> > Elapsed time: 249 s
> > mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0
> >
> > <dedup>
> > Elapsed time: 250 s
> > mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038 3945792
> >
> > There is no performance degradation and save 23% memory.
> >
> > Test result #3 (Blockdev):
> > copy android build output dir(out/host) to ext4 FS on zram
> >
> > <no-dedup>
> > Elapsed time: out/host: 88 s
> > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> >
> > <dedup>
> > Elapsed time: out/host: 100 s
> > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> >
> > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > it is due to overhead of calculating checksum and comparison.
> >
> > Test result #4 (Blockdev):
> > copy android build output dir(out/target/common) to ext4 FS on zram
> >
> > <no-dedup>
> > Elapsed time: out/host: 203 s
> > mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0
> >
> > <dedup>
> > Elapsed time: out/host: 201 s
> > mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 24564336
> >
> > Memory is saved by 42% and performance is the same. Even if there is overhead
> > of calculating checksum and comparison, large hit ratio compensate it since
> > hit leads to less compression attempt.
> >
> > I checked the detailed reason of savings on kernel build workload and
> > there are some cases that deduplication happens.
> >
> > 1) *.cmd
> > Build command is usually similar in one directory so content of these file
> > are very similar. In my system, more than 789 lines in fs/ext4/.namei.o.cmd
> > and fs/ext4/.inode.o.cmd are the same in 944 and 938 lines of the file,
> > respectively.
> >
> > 2) intermediate object files
> > built-in.o and temporary object file have the similar contents. More than
> > 50% of fs/ext4/ext4.o is the same with fs/ext4/built-in.o.
> >
> > 3) vmlinux
> > .tmp_vmlinux1 and .tmp_vmlinux2 and arch/x86/boo/compressed/vmlinux.bin
> > have the similar contents.
> >
> > Android test has similar case that some of object files(.class
> > and .so) are similar with another ones.
> > (./host/linux-x86/lib/libartd.so and
> > ./host/linux-x86-lib/libartd-comiler.so)
> >
> > Anyway, benefit seems to be largely dependent on the workload so
> > following patch will make this feature optional. However, this feature
> > can help some usecases so is deserved to be merged.
> >
> > [1]: MemScope: Analyzing Memory Duplication on Android Systems,
> > dl.acm.org/citation.cfm?id=2797023
> > [2]: zswap: Optimize compressed pool memory utilization,
> > lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2
> >
> > Signed-off-by: Joonsoo Kim <[email protected]>
> > ---
> > Documentation/blockdev/zram.txt | 2 +
> > drivers/block/zram/Makefile | 2 +-
> > drivers/block/zram/zram_dedup.c | 222 ++++++++++++++++++++++++++++++++++++++++
> > drivers/block/zram/zram_dedup.h | 25 +++++
> > drivers/block/zram/zram_drv.c | 62 ++++++++---
> > drivers/block/zram/zram_drv.h | 18 ++++
> > 6 files changed, 314 insertions(+), 17 deletions(-)
> > create mode 100644 drivers/block/zram/zram_dedup.c
> > create mode 100644 drivers/block/zram/zram_dedup.h
> >
> > diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
> > index 4fced8a..2cdc303 100644
> > --- a/Documentation/blockdev/zram.txt
> > +++ b/Documentation/blockdev/zram.txt
> > @@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
> > same_pages the number of same element filled pages written to this disk.
> > No memory is allocated for such pages.
> > pages_compacted the number of pages freed during compaction
> > + dup_data_size deduplicated data size
> > + meta_data_size the amount of metadata allocated for deduplication feature
> >
> > 9) Deactivate:
> > swapoff /dev/zram0
> > diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
> > index 9e2b79e..29cb008 100644
> > --- a/drivers/block/zram/Makefile
> > +++ b/drivers/block/zram/Makefile
> > @@ -1,3 +1,3 @@
> > -zram-y := zcomp.o zram_drv.o
> > +zram-y := zcomp.o zram_drv.o zram_dedup.o
> >
> > obj-$(CONFIG_ZRAM) += zram.o
> > diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
> > new file mode 100644
> > index 0000000..d313fc8
> > --- /dev/null
> > +++ b/drivers/block/zram/zram_dedup.c
> > @@ -0,0 +1,222 @@
> > +/*
> > + * Copyright (C) 2017 Joonsoo Kim.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version
> > + * 2 of the License, or (at your option) any later version.
> > + */
> > +
> > +#include <linux/vmalloc.h>
> > +#include <linux/jhash.h>
> > +
> > +#include "zram_drv.h"
> > +
> > +/* One slot will contain 128 pages theoretically */
> > +#define ZRAM_HASH_SHIFT 7
> > +#define ZRAM_HASH_SIZE_MIN (1 << 10)
> > +#define ZRAM_HASH_SIZE_MAX (1 << 31)
> > +
> > +u64 zram_dedup_dup_size(struct zram *zram)
> > +{
> > + return (u64)atomic64_read(&zram->stats.dup_data_size);
> > +}
> > +
> > +u64 zram_dedup_meta_size(struct zram *zram)
> > +{
> > + return (u64)atomic64_read(&zram->stats.meta_data_size);
> > +}
> > +
> > +static u32 zram_dedup_checksum(unsigned char *mem)
> > +{
> > + return jhash(mem, PAGE_SIZE, 0);
> > +}
> > +
> > +void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
> > + u32 checksum)
> > +{
> > + struct zram_meta *meta = zram->meta;
> > + struct zram_hash *hash;
> > + struct rb_root *rb_root;
> > + struct rb_node **rb_node, *parent = NULL;
> > + struct zram_entry *entry;
> > +
> > + new->checksum = checksum;
> > + hash = &meta->hash[checksum % meta->hash_size];
> > + rb_root = &hash->rb_root;
> > +
> > + spin_lock(&hash->lock);
> > + rb_node = &rb_root->rb_node;
> > + while (*rb_node) {
> > + parent = *rb_node;
> > + entry = rb_entry(parent, struct zram_entry, rb_node);
> > + if (checksum < entry->checksum)
> > + rb_node = &parent->rb_left;
> > + else if (checksum > entry->checksum)
> > + rb_node = &parent->rb_right;
> > + else
> > + rb_node = &parent->rb_left;
> > + }
> > +
> > + rb_link_node(&new->rb_node, parent, rb_node);
> > + rb_insert_color(&new->rb_node, rb_root);
> > + spin_unlock(&hash->lock);
> > +}
> > +
> > +static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
> > + unsigned char *mem)
> > +{
> > + bool match = false;
> > + unsigned char *cmem;
> > + struct zram_meta *meta = zram->meta;
> > + struct zcomp_strm *zstrm;
> > +
> > + cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
> > + if (entry->len == PAGE_SIZE) {
> > + match = !memcmp(mem, cmem, PAGE_SIZE);
> > + } else {
> > + zstrm = zcomp_stream_get(zram->comp);
> > + if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
> > + match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
> > + zcomp_stream_put(zram->comp);
> > + }
> > + zs_unmap_object(meta->mem_pool, entry->handle);
> > +
> > + return match;
> > +}
> > +
> > +static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry)
> > +{
> > + struct zram_hash *hash;
> > + u32 checksum;
> > + unsigned long refcount;
> > +
> > + checksum = entry->checksum;
> > + hash = &meta->hash[checksum % meta->hash_size];
> > +
> > + spin_lock(&hash->lock);
> > + entry->refcount--;
> > + refcount = entry->refcount;
> > + if (!entry->refcount) {
> > + rb_erase(&entry->rb_node, &hash->rb_root);
> > + RB_CLEAR_NODE(&entry->rb_node);
>
> What's the purpose of this RB_CLEAR_NODE?
> I think we don't need it.
I thought that it is needed to initialize the rb entry but with
re-check it seems to be unnecessary. I will remove it.
> > + } else if (zram) {
> > + atomic64_sub(entry->len, &zram->stats.dup_data_size);
> > + }
> > + spin_unlock(&hash->lock);
> > +
> > + return refcount;
> > +}
> > +
> > +static struct zram_entry *zram_dedup_get(struct zram *zram,
> > + unsigned char *mem, u32 checksum)
> > +{
> > + struct zram_meta *meta = zram->meta;
> > + struct zram_hash *hash;
> > + struct zram_entry *entry;
> > + struct rb_node *rb_node;
> > +
> > + hash = &meta->hash[checksum % meta->hash_size];
> > +
> > + spin_lock(&hash->lock);
> > + rb_node = hash->rb_root.rb_node;
> > + while (rb_node) {
> > + entry = rb_entry(rb_node, struct zram_entry, rb_node);
> > + if (checksum == entry->checksum) {
> > + entry->refcount++;
> > + atomic64_add(entry->len, &zram->stats.dup_data_size);
> > + spin_unlock(&hash->lock);
> > +
> > + if (zram_dedup_match(zram, entry, mem))
> > + return entry;
> > +
> > + zram_entry_free(zram, meta, entry);
> > +
> > + return NULL;
> > + }
> > +
> > + if (checksum < entry->checksum)
> > + rb_node = rb_node->rb_left;
> > + else
> > + rb_node = rb_node->rb_right;
> > + }
> > + spin_unlock(&hash->lock);
> > +
> > + return NULL;
> > +}
> > +
> > +struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
> > + u32 *checksum)
> > +{
> > + *checksum = zram_dedup_checksum(mem);
> > +
> > + return zram_dedup_get(zram, mem, *checksum);
> > +}
> > +
> > +struct zram_entry *zram_dedup_alloc(struct zram *zram,
> > + unsigned long handle, unsigned int len,
> > + gfp_t flags)
> > +{
> > + struct zram_entry *entry;
> > +
> > + entry = kzalloc(sizeof(*entry),
> > + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
>
> zram_entry_alloc can mask off GFP_HIGHMEM|__GFP_MOVABLE and
> it can pass filtered gfp flags to zs_malloc and zram_dedup_alloc.
Okay.
> > + if (!entry)
> > + return NULL;
> > +
> > + entry->handle = handle;
> > + RB_CLEAR_NODE(&entry->rb_node);
>
> Ditto. I think we don't need to clear rb_node.
Okay.
> > + entry->refcount = 1;
> > + entry->len = len;
> > +
> > + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
> > +
> > + return entry;
> > +}
> > +
> > +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry)
> > +{
> > + unsigned long handle;
> > +
> > + if (zram_dedup_put(zram, meta, entry))
> > + return 0;
> > +
> > + handle = entry->handle;
> > + kfree(entry);
> > +
> > + /* !zram happens when reset/fail and updating stat is useless */
>
> With recent version, it's not true so your code would be more clean. :)
Good!
>
> > + if (zram)
> > + atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
> > +
> > + return handle;
> > +}
> > +
> > +int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
> > +{
> > + int i;
> > + struct zram_hash *hash;
> > +
> > + meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
> > + meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
> > + meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
> > + meta->hash = vzalloc(meta->hash_size * sizeof(struct zram_hash));
> > + if (!meta->hash) {
> > + pr_err("Error allocating zram entry hash\n");
> > + return -ENOMEM;
> > + }
> > +
> > + for (i = 0; i < meta->hash_size; i++) {
> > + hash = &meta->hash[i];
> > + spin_lock_init(&hash->lock);
> > + hash->rb_root = RB_ROOT;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +void zram_dedup_fini(struct zram_meta *meta)
> > +{
> > + vfree(meta->hash);
> > +}
> > diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
> > new file mode 100644
> > index 0000000..7071f32
> > --- /dev/null
> > +++ b/drivers/block/zram/zram_dedup.h
> > @@ -0,0 +1,25 @@
> > +#ifndef _ZRAM_DEDUP_H_
> > +#define _ZRAM_DEDUP_H_
> > +
> > +struct zram;
> > +struct zram_meta;
> > +struct zram_entry;
> > +
> > +u64 zram_dedup_dup_size(struct zram *zram);
> > +u64 zram_dedup_meta_size(struct zram *zram);
> > +
> > +void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
> > + u32 checksum);
> > +struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
> > + u32 *checksum);
> > +
> > +struct zram_entry *zram_dedup_alloc(struct zram *zram,
> > + unsigned long handle, unsigned int len,
> > + gfp_t flags);
> > +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry);
> > +
> > +int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
> > +void zram_dedup_fini(struct zram_meta *meta);
> > +
> > +#endif /* _ZRAM_DEDUP_H_ */
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index f3949da..15cecd6 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -385,14 +385,16 @@ static ssize_t mm_stat_show(struct device *dev,
> > max_used = atomic_long_read(&zram->stats.max_used_pages);
> >
> > ret = scnprintf(buf, PAGE_SIZE,
> > - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> > + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> > orig_size << PAGE_SHIFT,
> > (u64)atomic64_read(&zram->stats.compr_data_size),
> > mem_used << PAGE_SHIFT,
> > zram->limit_pages << PAGE_SHIFT,
> > max_used << PAGE_SHIFT,
> > (u64)atomic64_read(&zram->stats.same_pages),
> > - pool_stats.pages_compacted);
> > + pool_stats.pages_compacted,
> > + zram_dedup_dup_size(zram),
> > + zram_dedup_meta_size(zram));
> > up_read(&zram->init_lock);
> >
> > return ret;
> > @@ -424,25 +426,29 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
> > {
> > struct zram_meta *meta = zram->meta;
> > struct zram_entry *entry;
> > + unsigned long handle;
> >
> > - entry = kzalloc(sizeof(*entry), flags);
> > - if (!entry)
> > + handle = zs_malloc(meta->mem_pool, len, flags);
> > + if (!handle)
> > return NULL;
> >
> > - entry->handle = zs_malloc(meta->mem_pool, len, flags);
> > - if (!entry->handle) {
> > - kfree(entry);
> > + entry = zram_dedup_alloc(zram, handle, len, flags);
>
> I don't like to hide zram_entry allocation behind zram_dedup_alloc.
> It's never wrong because zram_table allocation happens on only
> with dedup feature but zram_table is owned by zram_drv so I want
> for zram_drv to control zram_table alloc/free.
Okay.
> How about this?
>
> zram_entry_alloc
> unsigned long handle;
> struct zram_entry *entry;
>
> handle = zs_malloc;
> if (zram_dedup_enabled())
> return handle;
>
> entry = kzalloc();
> zram_dedup_init(entry, handle, len);
>
> return entry;
>
>
I will change it as you suggested.
> > + if (!entry) {
> > + zs_free(meta->mem_pool, handle);
> > return NULL;
> > }
> >
> > return entry;
> > }
> >
> > -static inline void zram_entry_free(struct zram_meta *meta,
> > - struct zram_entry *entry)
> > +void zram_entry_free(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry)
> > {
> > - zs_free(meta->mem_pool, entry->handle);
> > - kfree(entry);
> > + unsigned long handle;
> > +
> > + handle = zram_dedup_free(zram, meta, entry);
>
> Ditto.
> unsigned long handle = entry->handle;
>
> if (zram_dedup_put(zram, meta, entry))
> kfree(entry);
>
> zs_free(meta->mem_pool, handle);
>
> With this, In [4/4], I want that zram_entry_handle
> doesn't rely on zram_dedup_handle.
Okay.
Thanks.