2014-01-13 11:19:19

by Minchan Kim

Subject: [PATCH 0/7] zram bug fix and lock redesign

This patchset includes a bug fix and a series of cleanup patches
that resolve the locking mess.
Finally, the last patch enhances the write path significantly.

Minchan Kim (7):
zram: fix race between reset and flushing pending work
zram: delay pending free request in read path
zram: remove unnecessary free
zram: use atomic operation for stat
zram: introduce zram->tb_lock
zram: remove workqueue for freeing removed pending slot
zram: remove unnecessary lock

drivers/block/zram/zram_drv.c | 126 +++++++++++++++---------------------------
drivers/block/zram/zram_drv.h | 27 ++-------
2 files changed, 51 insertions(+), 102 deletions(-)

--
1.8.4.3


2014-01-13 11:19:26

by Minchan Kim

Subject: [PATCH 1/7] zram: fix race between reset and flushing pending work

Dan and Sergey reported a race between reset and the flushing of
pending work: reset can free zram->meta while zram_slot_free can
still access zram->meta if a new request arrives during the race
window, which leads to an oops.

This patch moves the flush to after init_lock is taken, which
prevents new requests and thereby closes the race.
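
Roughly, the race window looks like this (a sketch of the code
paths involved, not a verbatim trace):

    CPU0: zram_reset_device()       CPU1: incoming request
    flush_work(&zram->free_work);
                                    zram_slot_free_notify()
                                      schedule_work(&zram->free_work);
    down_write(&zram->init_lock);
    zram_meta_free(zram->meta);
    zram->meta = NULL;
                                    zram_slot_free() worker runs,
                                      dereferences freed zram->meta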

Cc: stable <[email protected]>
Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f9711c5..213dfc1 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -553,14 +553,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
size_t index;
struct zram_meta *meta;

- flush_work(&zram->free_work);
-
down_write(&zram->init_lock);
if (!zram->init_done) {
up_write(&zram->init_lock);
return;
}

+ flush_work(&zram->free_work);
+
meta = zram->meta;
zram->init_done = 0;

--
1.8.4.3

2014-01-13 11:19:32

by Minchan Kim

Subject: [PATCH 3/7] zram: remove unnecessary free

[1] introduced a pending zram slot free in zram's write path to
cover the case where a slot free is missed due to a memory
allocation failure in zram_slot_free_notify, but it is not
necessary because we have already freed the slot right before
overwriting it.

[1] [a0c516cbfc, zram: don't grab mutex in zram_slot_free_noity]
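
For reference, the write path already frees the slot right before
installing the new data; a simplified sketch of zram_bvec_write():

    /*
     * Free memory associated with this sector
     * before overwriting unused sectors.
     */
    zram_free_page(zram, index);

    meta->table[index].handle = handle;
    meta->table[index].size = clen;

so the "double check" free removed below can never be needed.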

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 8 --------
1 file changed, 8 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d87c7e9..ebfddd8 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -441,14 +441,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

- /*
- * zram_slot_free_notify could miss free so that let's
- * double check.
- */
- if (unlikely(meta->table[index].handle ||
- zram_test_flag(meta, index, ZRAM_ZERO)))
- zram_free_page(zram, index);
-
ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
meta->compress_workmem);

--
1.8.4.3

2014-01-13 11:19:37

by Minchan Kim

Subject: [PATCH 5/7] zram: introduce zram->tb_lock

Currently, the table is protected by zram->lock, but that is a
rather coarse-grained lock and it hurts scalability.

Let's use our own lock instead of depending on zram->lock.
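
The resulting pattern, simplified from the hunks below (in the real
read path the lock is held across decompression as well):

    /* read side: snapshot the table entry under the shared lock */
    read_lock(&meta->tb_lock);
    handle = meta->table[index].handle;
    size = meta->table[index].size;
    ...
    read_unlock(&meta->tb_lock);

    /* write side: free and update under the exclusive lock */
    write_lock(&meta->tb_lock);
    zram_free_page(zram, index);
    meta->table[index].handle = handle;
    meta->table[index].size = clen;
    write_unlock(&meta->tb_lock);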

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 26 +++++++++++++++++++++-----
drivers/block/zram/zram_drv.h | 3 ++-
2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9ab8849..24e6426 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -140,6 +140,7 @@ static ssize_t mem_used_total_show(struct device *dev,
return sprintf(buf, "%llu\n", val);
}

+/* flag operations needs meta->tb_lock */
static int zram_test_flag(struct zram_meta *meta, u32 index,
enum zram_pageflags flag)
{
@@ -228,6 +229,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
goto free_table;
}

+ rwlock_init(&meta->tb_lock);
return meta;

free_table:
@@ -280,6 +282,7 @@ static void handle_zero_page(struct bio_vec *bvec)
flush_dcache_page(page);
}

+/* NOTE: caller should hold meta->tb_lock with write-side */
static void zram_free_page(struct zram *zram, size_t index)
{
struct zram_meta *meta = zram->meta;
@@ -319,20 +322,26 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
size_t clen = PAGE_SIZE;
unsigned char *cmem;
struct zram_meta *meta = zram->meta;
- unsigned long handle = meta->table[index].handle;
+ unsigned long handle;
+ u16 size;
+
+ read_lock(&meta->tb_lock);
+ handle = meta->table[index].handle;
+ size = meta->table[index].size;

if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+ read_unlock(&meta->tb_lock);
clear_page(mem);
return 0;
}

cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
- if (meta->table[index].size == PAGE_SIZE)
+ if (size == PAGE_SIZE)
copy_page(mem, cmem);
else
- ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
- mem, &clen);
+ ret = lzo1x_decompress_safe(cmem, size, mem, &clen);
zs_unmap_object(meta->mem_pool, handle);
+ read_unlock(&meta->tb_lock);

/* Should NEVER happen. Return bio error if it does. */
if (unlikely(ret != LZO_E_OK)) {
@@ -353,11 +362,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
struct zram_meta *meta = zram->meta;
page = bvec->bv_page;

+ read_lock(&meta->tb_lock);
if (unlikely(!meta->table[index].handle) ||
zram_test_flag(meta, index, ZRAM_ZERO)) {
+ read_unlock(&meta->tb_lock);
handle_zero_page(bvec);
return 0;
}
+ read_unlock(&meta->tb_lock);

if (is_partial_io(bvec))
/* Use a temporary buffer to decompress the page */
@@ -433,10 +445,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
if (page_zero_filled(uncmem)) {
kunmap_atomic(user_mem);
/* Free memory associated with this sector now. */
+ write_lock(&zram->meta->tb_lock);
zram_free_page(zram, index);
+ zram_set_flag(meta, index, ZRAM_ZERO);
+ write_unlock(&zram->meta->tb_lock);

atomic_inc(&zram->stats.pages_zero);
- zram_set_flag(meta, index, ZRAM_ZERO);
ret = 0;
goto out;
}
@@ -486,10 +500,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
* Free memory associated with this sector
* before overwriting unused sectors.
*/
+ write_lock(&zram->meta->tb_lock);
zram_free_page(zram, index);

meta->table[index].handle = handle;
meta->table[index].size = clen;
+ write_unlock(&zram->meta->tb_lock);

/* Update stats */
atomic64_add(clen, &zram->stats.compr_size);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 81b0170..c3f453f 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -83,6 +83,7 @@ struct zram_stats {
};

struct zram_meta {
+ rwlock_t tb_lock; /* protect table */
void *compress_workmem;
void *compress_buffer;
struct table *table;
@@ -96,7 +97,7 @@ struct zram_slot_free {

struct zram {
struct zram_meta *meta;
- struct rw_semaphore lock; /* protect compression buffers, table,
+ struct rw_semaphore lock; /* protect compression buffers,
* reads and writes
*/

--
1.8.4.3

2014-01-13 11:19:45

by Minchan Kim

Subject: [PATCH 7/7] zram: remove unnecessary lock

A read/write lock's performance is really bad compared to a mutex
in write-mostly workloads. (AFAIR, there was recently some effort
to enhance it, but I'm not sure it got merged.)

Anyway, we no longer need such a coarse-grained read/write lock,
so this patch replaces the read/write lock with a mutex.
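
The effect on the I/O path, sketched from the hunks below:

    /* before: one rw-semaphore serialized all readers and writers */
    down_write(&zram->lock);
    ret = zram_bvec_write(zram, bvec, index, offset);
    up_write(&zram->lock);

    /* after: only the shared compression buffer needs exclusion */
    mutex_lock(&meta->buffer_lock);
    /* ... compress into meta->compress_buffer, update the table ... */
    mutex_unlock(&meta->buffer_lock);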

CPU 12
iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0
(left column: before this patch; right column: after)

==Initial write ==Initial write
records: 10 records: 10
avg: 516189.16 avg: 839907.96
std: 22486.53 std: 47902.17
max: 546970.60 max: 909910.35
min: 481131.54 min: 751148.38
==Rewrite ==Rewrite
records: 10 records: 10
avg: 509527.98 avg: 1050156.37
std: 45799.94 std: 40695.44
max: 611574.27 max: 1111929.26
min: 443679.95 min: 980409.62
==Read ==Read
records: 10 records: 10
avg: 4408624.17 avg: 4472546.76
std: 281152.61 std: 163662.78
max: 4867888.66 max: 4727351.03
min: 4058347.69 min: 4126520.88
==Re-read ==Re-read
records: 10 records: 10
avg: 4462147.53 avg: 4363257.75
std: 283546.11 std: 247292.63
max: 4912894.44 max: 4677241.75
min: 4131386.50 min: 4035235.84
==Reverse Read ==Reverse Read
records: 10 records: 10
avg: 4565865.97 avg: 4485818.08
std: 313395.63 std: 248470.10
max: 5232749.16 max: 4789749.94
min: 4185809.62 min: 3963081.34
==Stride read ==Stride read
records: 10 records: 10
avg: 4515981.80 avg: 4418806.01
std: 211192.32 std: 212837.97
max: 4889287.28 max: 4686967.22
min: 4210362.00 min: 4083041.84
==Random read ==Random read
records: 10 records: 10
avg: 4410525.23 avg: 4387093.18
std: 236693.22 std: 235285.23
max: 4713698.47 max: 4669760.62
min: 4057163.62 min: 3952002.16
==Mixed workload ==Mixed workload
records: 10 records: 10
avg: 243234.25 avg: 2818677.27
std: 28505.07 std: 195569.70
max: 288905.23 max: 3126478.11
min: 212473.16 min: 2484150.69
==Random write ==Random write
records: 10 records: 10
avg: 555887.07 avg: 1053057.79
std: 70841.98 std: 35195.36
max: 683188.28 max: 1096125.73
min: 437299.57 min: 992481.93
==Pwrite ==Pwrite
records: 10 records: 10
avg: 501745.93 avg: 810363.09
std: 16373.54 std: 19245.01
max: 518724.52 max: 833359.70
min: 464208.73 min: 765501.87
==Pread ==Pread
records: 10 records: 10
avg: 4539894.60 avg: 4457680.58
std: 197094.66 std: 188965.60
max: 4877170.38 max: 4689905.53
min: 4226326.03 min: 4095739.72

The read side seems to be a bit slower than before, but I believe
that's not a bad deal if we consider the increased performance on
the write side and the improved code readability.

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 17 ++++++++---------
drivers/block/zram/zram_drv.h | 4 +---
2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f1a3c95..011e55d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -230,6 +230,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
}

rwlock_init(&meta->tb_lock);
+ mutex_init(&meta->buffer_lock);
return meta;

free_table:
@@ -412,6 +413,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
+ bool locked = false;

page = bvec->bv_page;
src = meta->compress_buffer;
@@ -431,6 +433,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

+ mutex_lock(&meta->buffer_lock);
+ locked = true;
user_mem = kmap_atomic(page);

if (is_partial_io(bvec)) {
@@ -457,7 +461,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
meta->compress_workmem);
-
if (!is_partial_io(bvec)) {
kunmap_atomic(user_mem);
user_mem = NULL;
@@ -514,6 +517,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
atomic_inc(&zram->stats.good_compress);

out:
+ if (locked)
+ mutex_unlock(&meta->buffer_lock);
if (is_partial_io(bvec))
kfree(uncmem);

@@ -527,15 +532,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret;

- if (rw == READ) {
- down_read(&zram->lock);
+ if (rw == READ)
ret = zram_bvec_read(zram, bvec, index, offset, bio);
- up_read(&zram->lock);
- } else {
- down_write(&zram->lock);
+ else
ret = zram_bvec_write(zram, bvec, index, offset);
- up_write(&zram->lock);
- }

return ret;
}
@@ -808,7 +808,6 @@ static int create_device(struct zram *zram, int device_id)
{
int ret = -ENOMEM;

- init_rwsem(&zram->lock);
init_rwsem(&zram->init_lock);

zram->queue = blk_alloc_queue(GFP_KERNEL);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index d876300..ad8aa35 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -88,13 +88,11 @@ struct zram_meta {
void *compress_buffer;
struct table *table;
struct zs_pool *mem_pool;
+ struct mutex buffer_lock; /* protect compress buffers */
};

struct zram {
struct zram_meta *meta;
- struct rw_semaphore lock; /* protect compression buffers,
- * reads and writes
- */
struct request_queue *queue;
struct gendisk *disk;
int init_done;
--
1.8.4.3

2014-01-13 11:20:01

by Minchan Kim

Subject: [PATCH 6/7] zram: remove workqueue for freeing removed pending slot

[1] introduced the pending free request code to avoid sleeping on a
mutex while under a spinlock, and it was a mess which made the code
lengthy and increased overhead.

Now we no longer need zram->lock to free a slot, so this patch
reverts [1]; tb_lock protects the slot free instead.

[1] a0c516c, zram: don't grab mutex in zram_slot_free_noity

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 54 +++++--------------------------------------
drivers/block/zram/zram_drv.h | 10 --------
2 files changed, 6 insertions(+), 58 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 24e6426..f1a3c95 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -522,20 +522,6 @@ out:
return ret;
}

-static void handle_pending_slot_free(struct zram *zram)
-{
- struct zram_slot_free *free_rq;
-
- spin_lock(&zram->slot_free_lock);
- while (zram->slot_free_rq) {
- free_rq = zram->slot_free_rq;
- zram->slot_free_rq = free_rq->next;
- zram_free_page(zram, free_rq->index);
- kfree(free_rq);
- }
- spin_unlock(&zram->slot_free_lock);
-}
-
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset, struct bio *bio, int rw)
{
@@ -547,7 +533,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
up_read(&zram->lock);
} else {
down_write(&zram->lock);
- handle_pending_slot_free(zram);
ret = zram_bvec_write(zram, bvec, index, offset);
up_write(&zram->lock);
}
@@ -566,8 +551,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
return;
}

- flush_work(&zram->free_work);
-
meta = zram->meta;
zram->init_done = 0;

@@ -769,40 +752,19 @@ error:
bio_io_error(bio);
}

-static void zram_slot_free(struct work_struct *work)
-{
- struct zram *zram;
-
- zram = container_of(work, struct zram, free_work);
- down_write(&zram->lock);
- handle_pending_slot_free(zram);
- up_write(&zram->lock);
-}
-
-static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
-{
- spin_lock(&zram->slot_free_lock);
- free_rq->next = zram->slot_free_rq;
- zram->slot_free_rq = free_rq;
- spin_unlock(&zram->slot_free_lock);
-}
-
static void zram_slot_free_notify(struct block_device *bdev,
unsigned long index)
{
struct zram *zram;
- struct zram_slot_free *free_rq;
+ struct zram_meta *meta;

zram = bdev->bd_disk->private_data;
- atomic64_inc(&zram->stats.notify_free);
-
- free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
- if (!free_rq)
- return;
+ meta = zram->meta;

- free_rq->index = index;
- add_slot_free(zram, free_rq);
- schedule_work(&zram->free_work);
+ write_lock(&meta->tb_lock);
+ zram_free_page(zram, index);
+ write_unlock(&meta->tb_lock);
+ atomic64_inc(&zram->stats.notify_free);
}

static const struct block_device_operations zram_devops = {
@@ -849,10 +811,6 @@ static int create_device(struct zram *zram, int device_id)
init_rwsem(&zram->lock);
init_rwsem(&zram->init_lock);

- INIT_WORK(&zram->free_work, zram_slot_free);
- spin_lock_init(&zram->slot_free_lock);
- zram->slot_free_rq = NULL;
-
zram->queue = blk_alloc_queue(GFP_KERNEL);
if (!zram->queue) {
pr_err("Error allocating disk queue for device %d\n",
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index c3f453f..d876300 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -90,20 +90,11 @@ struct zram_meta {
struct zs_pool *mem_pool;
};

-struct zram_slot_free {
- unsigned long index;
- struct zram_slot_free *next;
-};
-
struct zram {
struct zram_meta *meta;
struct rw_semaphore lock; /* protect compression buffers,
* reads and writes
*/
-
- struct work_struct free_work; /* handle pending free request */
- struct zram_slot_free *slot_free_rq; /* list head of free request */
-
struct request_queue *queue;
struct gendisk *disk;
int init_done;
@@ -114,7 +105,6 @@ struct zram {
* we can store in a disk.
*/
u64 disksize; /* bytes */
- spinlock_t slot_free_lock;

struct zram_stats stats;
};
--
1.8.4.3

2014-01-13 11:20:38

by Minchan Kim

Subject: [PATCH 4/7] zram: use atomic operation for stat

Some of the fields in zram->stats are protected by zram->lock,
which is rather coarse-grained, so let's use atomic operations
without explicit locking.
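
The conversion follows this pattern (a sketch; in the current code
the exclusion comes from callers holding zram->lock rather than an
explicit lock around each counter):

    /* before: plain 32-bit counter, writers serialized by zram->lock */
    zram->stats.pages_stored++;

    /* after: lockless atomic update */
    atomic_inc(&zram->stats.pages_stored);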

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 20 ++++++++++----------
drivers/block/zram/zram_drv.h | 16 ++++++----------
2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ebfddd8..9ab8849 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -104,7 +104,7 @@ static ssize_t zero_pages_show(struct device *dev,
{
struct zram *zram = dev_to_zram(dev);

- return sprintf(buf, "%u\n", zram->stats.pages_zero);
+ return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero));
}

static ssize_t orig_data_size_show(struct device *dev,
@@ -113,7 +113,7 @@ static ssize_t orig_data_size_show(struct device *dev,
struct zram *zram = dev_to_zram(dev);

return sprintf(buf, "%llu\n",
- (u64)(zram->stats.pages_stored) << PAGE_SHIFT);
+ (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}

static ssize_t compr_data_size_show(struct device *dev,
@@ -293,21 +293,21 @@ static void zram_free_page(struct zram *zram, size_t index)
*/
if (zram_test_flag(meta, index, ZRAM_ZERO)) {
zram_clear_flag(meta, index, ZRAM_ZERO);
- zram->stats.pages_zero--;
+ atomic_dec(&zram->stats.pages_zero);
}
return;
}

if (unlikely(size > max_zpage_size))
- zram->stats.bad_compress--;
+ atomic_dec(&zram->stats.bad_compress);

zs_free(meta->mem_pool, handle);

if (size <= PAGE_SIZE / 2)
- zram->stats.good_compress--;
+ atomic_dec(&zram->stats.good_compress);

atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
- zram->stats.pages_stored--;
+ atomic_dec(&zram->stats.pages_stored);

meta->table[index].handle = 0;
meta->table[index].size = 0;
@@ -435,7 +435,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
/* Free memory associated with this sector now. */
zram_free_page(zram, index);

- zram->stats.pages_zero++;
+ atomic_inc(&zram->stats.pages_zero);
zram_set_flag(meta, index, ZRAM_ZERO);
ret = 0;
goto out;
@@ -456,7 +456,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}

if (unlikely(clen > max_zpage_size)) {
- zram->stats.bad_compress++;
+ atomic_inc(&zram->stats.bad_compress);
clen = PAGE_SIZE;
src = NULL;
if (is_partial_io(bvec))
@@ -493,9 +493,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

/* Update stats */
atomic64_add(clen, &zram->stats.compr_size);
- zram->stats.pages_stored++;
+ atomic_inc(&zram->stats.pages_stored);
if (clen <= PAGE_SIZE / 2)
- zram->stats.good_compress++;
+ atomic_inc(&zram->stats.good_compress);

out:
if (is_partial_io(bvec))
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 0e46953..81b0170 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -68,10 +68,6 @@ struct table {
u8 flags;
} __aligned(4);

-/*
- * All 64bit fields should only be manipulated by 64bit atomic accessors.
- * All modifications to 32bit counter should be protected by zram->lock.
- */
struct zram_stats {
atomic64_t compr_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
@@ -80,10 +76,10 @@ struct zram_stats {
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
atomic64_t notify_free; /* no. of swap slot free notifications */
- u32 pages_zero; /* no. of zero filled pages */
- u32 pages_stored; /* no. of pages currently stored */
- u32 good_compress; /* % of pages with compression ratio<=50% */
- u32 bad_compress; /* % of pages with compression ratio>=75% */
+ atomic_t pages_zero; /* no. of zero filled pages */
+ atomic_t pages_stored; /* no. of pages currently stored */
+ atomic_t good_compress; /* % of pages with compression ratio<=50% */
+ atomic_t bad_compress; /* % of pages with compression ratio>=75% */
};

struct zram_meta {
@@ -101,8 +97,8 @@ struct zram_slot_free {
struct zram {
struct zram_meta *meta;
struct rw_semaphore lock; /* protect compression buffers, table,
- * 32bit stat counters against concurrent
- * notifications, reads and writes */
+ * reads and writes
+ */

struct work_struct free_work; /* handle pending free request */
struct zram_slot_free *slot_free_rq; /* list head of free request */
--
1.8.4.3

2014-01-13 11:21:33

by Minchan Kim

Subject: [PATCH 2/7] zram: delay pending free request in read path

Sergey Senozhatsky reported that we don't need to handle pending
free requests on every I/O, so this patch removes the handling
from the read path while keeping it in the write path.

Let's consider the example below.

The swap subsystem asks zram to free block "A" via
swap_slot_free_notify, but zram leaves the request pending without
actually freeing the block. The swap subsystem then allocates
block "A" for new data, and when the long-pending free request is
finally handled, zram blindly frees the new data in block "A". :(

That's why we cannot remove the pending free request handling
right before a zram write.
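
As a rough timeline (a sketch of the scenario above, not verbatim
code):

    swap_slot_free_notify(A)   /* zram queues a pending free of "A" */
    swap reuses block A        /* zram_bvec_write() stores new data  */
    pending free runs late     /* zram_free_page(A) frees new data!  */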

Signed-off-by: Minchan Kim <[email protected]>
---
drivers/block/zram/zram_drv.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 213dfc1..d87c7e9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -535,7 +535,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,

if (rw == READ) {
down_read(&zram->lock);
- handle_pending_slot_free(zram);
ret = zram_bvec_read(zram, bvec, index, offset, bio);
up_read(&zram->lock);
} else {
--
1.8.4.3

2014-01-13 19:45:54

by Sergey Senozhatsky

Subject: Re: [PATCH 6/7] zram: remove workqueue for freeing removed pending slot

On (01/13/14 20:19), Minchan Kim wrote:
> [1] introduced the pending free request code to avoid sleeping on a
> mutex while under a spinlock, and it was a mess which made the code
> lengthy and increased overhead.
>
> Now we no longer need zram->lock to free a slot, so this patch
> reverts [1]; tb_lock protects the slot free instead.
>
> [1] a0c516c, zram: don't grab mutex in zram_slot_free_noity
> Signed-off-by: Minchan Kim <[email protected]>
>
> [...]
>
> @@ -769,40 +752,19 @@ error:
> bio_io_error(bio);
> }
>
> -static void zram_slot_free(struct work_struct *work)
> -{
> - struct zram *zram;
> -
> - zram = container_of(work, struct zram, free_work);
> - down_write(&zram->lock);
> - handle_pending_slot_free(zram);
> - up_write(&zram->lock);
> -}
> -
> -static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
> -{
> - spin_lock(&zram->slot_free_lock);
> - free_rq->next = zram->slot_free_rq;
> - zram->slot_free_rq = free_rq;
> - spin_unlock(&zram->slot_free_lock);
> -}
> -
> static void zram_slot_free_notify(struct block_device *bdev,
> unsigned long index)
> {
> struct zram *zram;
> - struct zram_slot_free *free_rq;
> + struct zram_meta *meta;
>
> zram = bdev->bd_disk->private_data;
> - atomic64_inc(&zram->stats.notify_free);
> -
> - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
> - if (!free_rq)
> - return;
> + meta = zram->meta;
>
> - free_rq->index = index;
> - add_slot_free(zram, free_rq);
> - schedule_work(&zram->free_work);
> + write_lock(&meta->tb_lock);
> + zram_free_page(zram, index);
> + write_unlock(&meta->tb_lock);
> + atomic64_inc(&zram->stats.notify_free);
> }
>

Hello Minchan,
I think we need to down_write init_lock in zram_slot_free_notify(),
and thus can avoid locking meta->tb_lock. Otherwise, I think,
there is a chance that zram_slot_free_notify() can race with
device reset, e.g.

zram_slot_free_notify()          zram_reset_device()
                                 down_write(&zram->init_lock);
meta = zram->meta
                                 zram_meta_free(zram->meta);
                                 zram->meta = NULL;
write_lock(&meta->tb_lock);
[...]
write_unlock(&meta->tb_lock);
[..]
                                 up_write(&zram->init_lock);

-ss

2014-01-13 23:38:08

by Minchan Kim

Subject: Re: [PATCH 6/7] zram: remove workqueue for freeing removed pending slot

Hello Sergey,

On Mon, Jan 13, 2014 at 10:42:56PM +0300, Sergey Senozhatsky wrote:
> On (01/13/14 20:19), Minchan Kim wrote:
> > [...]
>
> Hello Minchan,
> I think we need to down_write init_lock in zram_slot_free_notify(),
> and thus can avoid locking meta->tb_lock. Otherwise, I think,

zram_slot_free_notify is an atomic path, so we can't hold a mutex there.

> there is a chance that zram_slot_free_notify() can race with
> device reset, e.g.
>
> zram_slot_free_notify()          zram_reset_device()
>                                  down_write(&zram->init_lock);
> meta = zram->meta
>                                  zram_meta_free(zram->meta);
>                                  zram->meta = NULL;
> write_lock(&meta->tb_lock);
> [...]
> write_unlock(&meta->tb_lock);
> [..]
>                                  up_write(&zram->init_lock);
>

Nope. We can't reset an active device, thanks to the
bdev->bd_holders check logic in reset_store.
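
For reference, reset_store() bails out early while the device is
still held open, roughly (a sketch from memory of that code):

    /* Do not reset an active device! */
    if (bdev->bd_holders)
        return -EBUSY;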


> -ss
--
Kind regards,
Minchan Kim

2014-01-13 23:55:33

by Andrew Morton

Subject: Re: [PATCH 1/7] zram: fix race between reset and flushing pending work

On Mon, 13 Jan 2014 20:18:56 +0900 Minchan Kim <[email protected]> wrote:

> Dan and Sergey reported a race between reset and the flushing of
> pending work: reset can free zram->meta while zram_slot_free can
> still access zram->meta if a new request arrives during the race
> window, which leads to an oops.
>
> This patch moves the flush to after init_lock is taken, which
> prevents new requests and thereby closes the race.
>
> ..
>
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -553,14 +553,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity)
> size_t index;
> struct zram_meta *meta;
>
> - flush_work(&zram->free_work);
> -
> down_write(&zram->init_lock);
> if (!zram->init_done) {
> up_write(&zram->init_lock);
> return;
> }
>
> + flush_work(&zram->free_work);
> +
> meta = zram->meta;
> zram->init_done = 0;

This makes zram.lock nest inside zram.init_lock, which afaict is new
behaviour.

That all seems OK and logical - has it been well tested with lockdep?

2014-01-13 23:58:17

by Andrew Morton

Subject: Re: [PATCH 4/7] zram: use atomic operation for stat

On Mon, 13 Jan 2014 20:18:59 +0900 Minchan Kim <[email protected]> wrote:

> Some of the fields in zram->stats are protected by zram->lock,
> which is rather coarse-grained, so let's use atomic operations
> without explicit locking.

Confused. The patch didn't remove any locking, so it made the code
slower.

2014-01-14 00:15:18

by Minchan Kim

Subject: Re: [PATCH 1/7] zram: fix race between reset and flushing pending work

On Mon, Jan 13, 2014 at 03:55:27PM -0800, Andrew Morton wrote:
> On Mon, 13 Jan 2014 20:18:56 +0900 Minchan Kim <[email protected]> wrote:
>
> > Dan and Sergey reported a race between reset and the flushing of
> > pending work: reset can free zram->meta while zram_slot_free can
> > still access zram->meta if a new request arrives during the race
> > window, which leads to an oops.
> >
> > This patch moves the flush to after init_lock is taken, which
> > prevents new requests and thereby closes the race.
> >
> > ..
> >
>
> This makes zram.lock nest inside zram.init_lock, which afaict is new
> behaviour.

Originally, it was nested, so it's not new. :)
Look at zram_make_request, which holds init_lock, and then
zram_bvec_rw, which holds zram->lock.
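
That is, the existing nesting is (a sketch of the call chain):

    zram_make_request()
      down_read(&zram->init_lock);
        zram_bvec_rw()
          down_read()/down_write(&zram->lock);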

>
> That all seems OK and logical - has it been well tested with lockdep?

Yep.


--
Kind regards,
Minchan Kim

2014-01-14 00:18:57

by Minchan Kim

Subject: Re: [PATCH 4/7] zram: use atomic operation for stat

On Mon, Jan 13, 2014 at 03:58:14PM -0800, Andrew Morton wrote:
> On Mon, 13 Jan 2014 20:18:59 +0900 Minchan Kim <[email protected]> wrote:
>
> > Some of the fields in zram->stats are protected by zram->lock,
> > which is rather coarse-grained, so let's use atomic operations
> > without explicit locking.
>
> Confused. The patch didn't remove any locking, so it made the code
> slower.

True, but it removes the dependency on zram->lock for the 32-bit
stats, so further patches can remove messy code and enhance write
performance. So, it's a preparatory patch for the next step.
Should I rewrite the description to explain this?


--
Kind regards,
Minchan Kim

2014-01-14 00:23:31

by Andrew Morton

Subject: Re: [PATCH 4/7] zram: use atomic operation for stat

On Tue, 14 Jan 2014 09:19:35 +0900 Minchan Kim <[email protected]> wrote:

> On Mon, Jan 13, 2014 at 03:58:14PM -0800, Andrew Morton wrote:
> > On Mon, 13 Jan 2014 20:18:59 +0900 Minchan Kim <[email protected]> wrote:
> >
> > > Some of the fields in zram->stats are protected by zram->lock,
> > > which is rather coarse-grained, so let's use atomic operations
> > > without explicit locking.
> >
> > Confused. The patch didn't remove any locking, so it made the code
> > slower.
>
> True, but it removes the dependency on zram->lock for the 32-bit
> stats, so further patches can remove messy code and enhance write
> performance. So, it's a preparatory patch for the next step.
> Should I rewrite the description to explain this?

That would be useful ;) I'd ask for performance testing results but I
expect they'll say "no difference".

I grabbed patches 1-3 as they seem appropriate for 3.14.

2014-01-14 00:38:18

by Minchan Kim

Subject: Re: [PATCH 4/7] zram: use atomic operation for stat

Hello Andrew,

On Mon, Jan 13, 2014 at 04:23:25PM -0800, Andrew Morton wrote:
> On Tue, 14 Jan 2014 09:19:35 +0900 Minchan Kim <[email protected]> wrote:
>
> > On Mon, Jan 13, 2014 at 03:58:14PM -0800, Andrew Morton wrote:
> > > On Mon, 13 Jan 2014 20:18:59 +0900 Minchan Kim <[email protected]> wrote:
> > >
> > > > Some of the fields in zram->stats are protected by zram->lock,
> > > > which is rather coarse-grained, so let's use atomic operations
> > > > without explicit locking.
> > >
> > > Confused. The patch didn't remove any locking, so it made the code
> > > slower.
> >
> > True, but it removes the dependency on zram->lock for the 32-bit
> > stats, so further patches can remove messy code and enhance write
> > performance. So, it's a preparatory patch for the next step.
> > Should I rewrite the description to explain this?
>
> That would be useful ;) I'd ask for performance testing results but I
> expect they'll say "no difference".
>
> I grabbed patches 1-3 as they seem appropriate for 3.14.

I will resend it tomorrow with testing results.
Thanks!


--
Kind regards,
Minchan Kim

2014-01-14 07:12:05

by Sergey Senozhatsky

Subject: Re: [PATCH 6/7] zram: remove workqueue for freeing removed pending slot

Hello Minchan,

On (01/14/14 08:38), Minchan Kim wrote:
> Hello Sergey,
>
> On Mon, Jan 13, 2014 at 10:42:56PM +0300, Sergey Senozhatsky wrote:
> > On (01/13/14 20:19), Minchan Kim wrote:
> >
> > Hello Minchan,
> > I think we need to down_write init_lock in zram_slot_free_notify(),
> > and thus can avoid locking meta->tb_lock. Otherwise, I think,
>
> zram_slot_free_notify is an atomic path, so we can't hold a mutex there.
>
> > there is a chance that zram_slot_free_notify() can race with
> > device reset, e.g.
> >
> > zram_slot_free_notify()          zram_reset_device()
> >                                  down_write(&zram->init_lock);
> > meta = zram->meta
> >                                  zram_meta_free(zram->meta);
> >                                  zram->meta = NULL;
> > write_lock(&meta->tb_lock);
> > [...]
> > write_unlock(&meta->tb_lock);
> > [..]
> >                                  up_write(&zram->init_lock);
> >
>
> Nope. We can't reset an active device, thanks to the
> bdev->bd_holders check logic in reset_store.
>

True. Sorry for the noise.

-ss

2014-01-14 07:17:29

by Sergey Senozhatsky

Subject: Re: [PATCH 1/7] zram: fix race between reset and flushing pending work

On (01/13/14 15:55), Andrew Morton wrote:
[..]
> > [...]
>
> This makes zram.lock nest inside zram.init_lock, which afaict is new
> behaviour.
>
> That all seems OK and logical - has it been well tested with lockdep?
>

Patches 1-7
Tested-by: Sergey Senozhatsky <[email protected]>

-ss

2014-01-14 09:30:40

by Jerome Marchand

Subject: Re: [PATCH 7/7] zram: remove unnecessary lock

On 01/13/2014 12:19 PM, Minchan Kim wrote:
> A read/write lock's performance is really bad compared to a mutex
> in write-mostly workloads. (AFAIR, there was recently some effort
> to enhance it, but I'm not sure it got merged.)
>
> Anyway, we no longer need such a coarse-grained read/write lock,
> so this patch replaces the read/write lock with a mutex.

I find your description misleading. You seem to imply that
the r/w semaphore is inappropriate here and that replacing
it with a mutex increased performance, while in fact (correct
me if I'm wrong) you replaced the rw semaphore with another
rw semaphore, a mutex and atomic operations. It seems to me
that the perf enhancement comes from the finer grain, not
from an rw lock perf issue.
Also, please add a general description of the locking
changes you made. Like Andrew, I was also confused at first
by your fourth patch.

Jerome

2014-01-15 01:33:18

by Minchan Kim

Subject: Re: [PATCH 7/7] zram: remove unnecessary lock

Hello Jerome,

On Tue, Jan 14, 2014 at 10:29:40AM +0100, Jerome Marchand wrote:
> On 01/13/2014 12:19 PM, Minchan Kim wrote:
> > A read/write lock's performance is really bad compared to a mutex
> > in write-mostly workloads. (AFAIR, there was recently some effort
> > to enhance it, but I'm not sure it got merged.)
> >
> > Anyway, we no longer need such a coarse-grained read/write lock,
> > so this patch replaces the read/write lock with a mutex.
>
> I find your description misleading. You seem to imply that
> the r/w semaphore is inappropriate here and that replacing
> it with a mutex increased performance, while in fact (correct
> me if I'm wrong) you replaced the rw semaphore with another
> rw semaphore, a mutex and atomic operations. It seems to me
> that the perf enhancement comes from the finer grain, not
> from an rw lock perf issue.
> Also, please add a general description of the locking
> changes you made. Like Andrew, I was also confused at first
> by your fourth patch.

Thanks for the review.
I just sent v2 with an updated description.
Please review it.

In summary,
I removed the big read-write critical section protected by the
rw-semaphore, so IOzone's mixed workload performs better than
before. As a bonus, I replaced the rw-semaphore with a mutex, which
also improves write-write concurrency, since the mutex supports
SPIN_ON_OWNER while the rw-semaphore doesn't yet.

Thanks.

--
Kind regards,
Minchan Kim