2012-11-27 07:26:50

by Nitin Gupta

[permalink] [raw]
Subject: [PATCH 1/2] zsmalloc: add function to query object size

Adds zs_get_object_size(handle) which provides the size of
the given object. This is useful since the user (zram etc.)
now do not have to maintain object sizes separately, saving
on some metadata size (4b per page).

The object handle encodes <page, offset> pair which currently points
to the start of the object. Now, the handle implicitly stores the size
information by pointing to the object's end instead. Since zsmalloc is
a slab based allocator, the start of the object can be easily determined
and the difference between the end offset encoded in the handle and the
start gives us the object size.

Signed-off-by: Nitin Gupta <[email protected]>
---
drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
drivers/staging/zsmalloc/zsmalloc.h | 1 +
2 files changed, 127 insertions(+), 51 deletions(-)

diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
index 09a9d35..65c9d3b 100644
--- a/drivers/staging/zsmalloc/zsmalloc-main.c
+++ b/drivers/staging/zsmalloc/zsmalloc-main.c
@@ -112,20 +112,20 @@
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
+ * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
* be PAGE_SHIFT
*/
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
-#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
+#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
- MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE

/*
@@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
return PagePrivate2(page);
}

+static unsigned long get_page_index(struct page *page)
+{
+ return is_first_page(page) ? 0 : page->index;
+}
+
static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
enum fullness_group *fullness)
{
@@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
return next;
}

-/* Encode <page, obj_idx> as a single handle value */
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static struct page *get_prev_page(struct page *page)
{
- unsigned long handle;
+ struct page *prev, *first_page;

- if (!page) {
- BUG_ON(obj_idx);
- return NULL;
- }
+ first_page = get_first_page(page);
+ if (page == first_page)
+ prev = NULL;
+ else if (page == (struct page *)first_page->private)
+ prev = first_page;
+ else
+ prev = list_entry(page->lru.prev, struct page, lru);

- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= (obj_idx & OBJ_INDEX_MASK);
+ return prev;

- return (void *)handle;
}

-/* Decode <page, obj_idx> pair from the given object handle */
-static void obj_handle_to_location(unsigned long handle, struct page **page,
- unsigned long *obj_idx)
+static void *encode_ptr(struct page *page, unsigned long offset)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = handle & OBJ_INDEX_MASK;
+ unsigned long ptr;
+ ptr = page_to_pfn(page) << OFFSET_BITS;
+ ptr |= offset & OFFSET_MASK;
+ return (void *)ptr;
+}
+
+static void decode_ptr(unsigned long ptr, struct page **page,
+ unsigned int *offset)
+{
+ *page = pfn_to_page(ptr >> OFFSET_BITS);
+ *offset = ptr & OFFSET_MASK;
+}
+
+static struct page *obj_handle_to_page(unsigned long handle)
+{
+ struct page *page;
+ unsigned int offset;
+
+ decode_ptr(handle, &page, &offset);
+ if (offset < get_page_index(page))
+ page = get_prev_page(page);
+
+ return page;
+}
+
+static unsigned int obj_handle_to_offset(unsigned long handle,
+ unsigned int class_size)
+{
+ struct page *page;
+ unsigned int offset;
+
+ decode_ptr(handle, &page, &offset);
+ if (offset < get_page_index(page))
+ offset = PAGE_SIZE - class_size + get_page_index(page);
+ else
+ offset = roundup(offset, class_size) - class_size;
+
+ return offset;
}

-static unsigned long obj_idx_to_offset(struct page *page,
- unsigned long obj_idx, int class_size)
+/* Encode <page, offset, size> as a single handle value */
+static void *obj_location_to_handle(struct page *page, unsigned int offset,
+ unsigned int size, unsigned int class_size)
{
- unsigned long off = 0;
+ struct page *endpage;
+ unsigned int endoffset;

- if (!is_first_page(page))
- off = page->index;
+ if (!page) {
+ BUG_ON(offset);
+ return NULL;
+ }
+ BUG_ON(offset >= PAGE_SIZE);
+
+ endpage = page;
+ endoffset = offset + size - 1;
+ if (endoffset >= PAGE_SIZE) {
+ endpage = get_next_page(page);
+ BUG_ON(!endpage);
+ endoffset -= PAGE_SIZE;
+ }

- return off + obj_idx * class_size;
+ return encode_ptr(endpage, endoffset);
}

static void reset_page(struct page *page)
@@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
- unsigned long off = 0;
+ unsigned long off = 0, next_off = 0;
struct page *page = first_page;

BUG_ON(!is_first_page(first_page));
while (page) {
struct page *next_page;
struct link_free *link;
- unsigned int i, objs_on_page;

/*
* page->index stores offset of first object starting
@@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)

link = (struct link_free *)kmap_atomic(page) +
off / sizeof(*link);
- objs_on_page = (PAGE_SIZE - off) / class->size;

- for (i = 1; i <= objs_on_page; i++) {
- off += class->size;
- if (off < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i);
- link += class->size / sizeof(*link);
- }
+ next_off = off + class->size;
+ while (next_off < PAGE_SIZE) {
+ link->next = encode_ptr(page, next_off);
+ link += class->size / sizeof(*link);
+ next_off += class->size;
}

/*
@@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
+ next_off = next_page ? next_off - PAGE_SIZE : 0;
+ link->next = encode_ptr(next_page, next_off);
kunmap_atomic(link);
page = next_page;
- off = (off + class->size) % PAGE_SIZE;
+ off = next_off;
}
}

@@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)

init_zspage(first_page, class);

- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = encode_ptr(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

@@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
struct size_class *class;

struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ unsigned int m_offset;

if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
@@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
}

obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+ decode_ptr(obj, &m_page, &m_offset);

link = (struct link_free *)kmap_atomic(m_page) +
m_offset / sizeof(*link);
@@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
first_page->inuse++;
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(pool, first_page);
+
+ obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
+ size, class->size);
spin_unlock(&class->lock);

return obj;
@@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
- unsigned long f_objidx, f_offset;
+ unsigned long f_offset;

int class_idx;
struct size_class *class;
@@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
if (unlikely(!obj))
return;

- obj_handle_to_location(obj, &f_page, &f_objidx);
+ f_page = obj_handle_to_page(obj);
first_page = get_first_page(f_page);

get_zspage_mapping(first_page, &class_idx, &fullness);
class = &pool->size_class[class_idx];
- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+ f_offset = obj_handle_to_offset(obj, class->size);

spin_lock(&class->lock);

@@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
+ f_offset);
link->next = first_page->freelist;
kunmap_atomic(link);
- first_page->freelist = (void *)obj;
+ first_page->freelist = encode_ptr(f_page, f_offset);

first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
@@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
* This function returns with preemption and page faults disabled.
*/
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+ enum zs_mapmode mm)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long off;

unsigned int class_idx;
enum fullness_group fg;
@@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());

- obj_handle_to_location(handle, &page, &obj_idx);
+ page = obj_handle_to_page(handle);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = obj_handle_to_offset(handle, class->size);

area = &get_cpu_var(zs_map_area);
area->vm_mm = mm;
@@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long off;

unsigned int class_idx;
enum fullness_group fg;
@@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)

BUG_ON(!handle);

- obj_handle_to_location(handle, &page, &obj_idx);
+ page = obj_handle_to_page(handle);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = &pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = obj_handle_to_offset(handle, class->size);

area = &__get_cpu_var(zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *endpage;
+ unsigned int endoffset, size;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+
+ decode_ptr(handle, &endpage, &endoffset);
+ get_zspage_mapping(endpage, &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+
+ size = endoffset + 1;
+ if (endoffset < get_page_index(endpage))
+ size += class->size - get_page_index(endpage);
+ else
+ size -= rounddown(endoffset, class->size);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(zs_get_object_size);
+
u64 zs_get_total_size_bytes(struct zs_pool *pool)
{
int i;
diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
index de2e8bf..2830fdf 100644
--- a/drivers/staging/zsmalloc/zsmalloc.h
+++ b/drivers/staging/zsmalloc/zsmalloc.h
@@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle);

+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
u64 zs_get_total_size_bytes(struct zs_pool *pool);

#endif
--
1.7.10.4


2012-11-27 07:26:58

by Nitin Gupta

[permalink] [raw]
Subject: [PATCH 2/2] zram: reduce metadata overhead

For every allocated object, zram maintains the handle, size,
flags and count fields. Of these, only the handle is required
since zsmalloc now provides the object size given the handle.
The flags field was needed only to mark a given page as zero-filled.
Instead of this field, we now use an invalid value (-1) to mark such
pages. Lastly, the count field was unused, so was simply removed.

Signed-off-by: Nitin Gupta <[email protected]>
---
drivers/staging/zram/zram_drv.c | 80 ++++++++++++++-------------------------
drivers/staging/zram/zram_drv.h | 18 +--------
2 files changed, 31 insertions(+), 67 deletions(-)

diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index f2a73bd..8ff7b67 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -71,24 +71,6 @@ static void zram_stat64_inc(struct zram *zram, u64 *v)
zram_stat64_add(zram, v, 1);
}

-static int zram_test_flag(struct zram *zram, u32 index,
- enum zram_pageflags flag)
-{
- return zram->table[index].flags & BIT(flag);
-}
-
-static void zram_set_flag(struct zram *zram, u32 index,
- enum zram_pageflags flag)
-{
- zram->table[index].flags |= BIT(flag);
-}
-
-static void zram_clear_flag(struct zram *zram, u32 index,
- enum zram_pageflags flag)
-{
- zram->table[index].flags &= ~BIT(flag);
-}
-
static int page_zero_filled(void *ptr)
{
unsigned int pos;
@@ -135,21 +117,20 @@ static void zram_set_disksize(struct zram *zram, size_t totalram_bytes)

static void zram_free_page(struct zram *zram, size_t index)
{
- unsigned long handle = zram->table[index].handle;
- u16 size = zram->table[index].size;
+ unsigned long handle = zram->handle[index];
+ size_t size;

- if (unlikely(!handle)) {
- /*
- * No memory is allocated for zero filled pages.
- * Simply clear zero page flag.
- */
- if (zram_test_flag(zram, index, ZRAM_ZERO)) {
- zram_clear_flag(zram, index, ZRAM_ZERO);
- zram_stat_dec(&zram->stats.pages_zero);
- }
+ if (unlikely(!handle))
+ return;
+
+ if (handle == zero_page_handle) {
+ /* No memory is allocated for zero filled pages */
+ zram->handle[index] = 0;
+ zram_stat_dec(&zram->stats.pages_zero);
return;
}

+ size = zs_get_object_size(zram->mem_pool, handle);
if (unlikely(size > max_zpage_size))
zram_stat_dec(&zram->stats.bad_compress);

@@ -158,12 +139,10 @@ static void zram_free_page(struct zram *zram, size_t index)
if (size <= PAGE_SIZE / 2)
zram_stat_dec(&zram->stats.good_compress);

- zram_stat64_sub(zram, &zram->stats.compr_size,
- zram->table[index].size);
+ zram_stat64_sub(zram, &zram->stats.compr_size, size);
zram_stat_dec(&zram->stats.pages_stored);

- zram->table[index].handle = 0;
- zram->table[index].size = 0;
+ zram->handle[index] = 0;
}

static void handle_zero_page(struct bio_vec *bvec)
@@ -188,19 +167,20 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
int ret = LZO_E_OK;
size_t clen = PAGE_SIZE;
unsigned char *cmem;
- unsigned long handle = zram->table[index].handle;
+ unsigned long handle = zram->handle[index];
+ size_t objsize;

- if (!handle || zram_test_flag(zram, index, ZRAM_ZERO)) {
+ if (!handle || (handle == zero_page_handle)) {
memset(mem, 0, PAGE_SIZE);
return 0;
}

+ objsize = zs_get_object_size(zram->mem_pool, handle);
cmem = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
- if (zram->table[index].size == PAGE_SIZE)
+ if (objsize == PAGE_SIZE)
memcpy(mem, cmem, PAGE_SIZE);
else
- ret = lzo1x_decompress_safe(cmem, zram->table[index].size,
- mem, &clen);
+ ret = lzo1x_decompress_safe(cmem, objsize, mem, &clen);
zs_unmap_object(zram->mem_pool, handle);

/* Should NEVER happen. Return bio error if it does. */
@@ -222,8 +202,8 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,

page = bvec->bv_page;

- if (unlikely(!zram->table[index].handle) ||
- zram_test_flag(zram, index, ZRAM_ZERO)) {
+ if (unlikely(!zram->handle[index]) ||
+ (zram->handle[index] == zero_page_handle)) {
handle_zero_page(bvec);
return 0;
}
@@ -294,8 +274,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
* System overwrites unused sectors. Free memory associated
* with this sector now.
*/
- if (zram->table[index].handle ||
- zram_test_flag(zram, index, ZRAM_ZERO))
+ if (zram->handle[index])
zram_free_page(zram, index);

user_mem = kmap_atomic(page);
@@ -313,7 +292,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
if (!is_partial_io(bvec))
kunmap_atomic(user_mem);
zram_stat_inc(&zram->stats.pages_zero);
- zram_set_flag(zram, index, ZRAM_ZERO);
+ zram->handle[index] = zero_page_handle;
ret = 0;
goto out;
}
@@ -357,8 +336,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

zs_unmap_object(zram->mem_pool, handle);

- zram->table[index].handle = handle;
- zram->table[index].size = clen;
+ zram->handle[index] = handle;

/* Update stats */
zram_stat64_add(zram, &zram->stats.compr_size, clen);
@@ -517,15 +495,15 @@ void __zram_reset_device(struct zram *zram)

/* Free all pages that are still in this zram device */
for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
- unsigned long handle = zram->table[index].handle;
- if (!handle)
+ unsigned long handle = zram->handle[index];
+ if (!handle || (handle == zero_page_handle))
continue;

zs_free(zram->mem_pool, handle);
}

- vfree(zram->table);
- zram->table = NULL;
+ vfree(zram->handle);
+ zram->handle = NULL;

zs_destroy_pool(zram->mem_pool);
zram->mem_pool = NULL;
@@ -573,8 +551,8 @@ int zram_init_device(struct zram *zram)
}

num_pages = zram->disksize >> PAGE_SHIFT;
- zram->table = vzalloc(num_pages * sizeof(*zram->table));
- if (!zram->table) {
+ zram->handle = vzalloc(num_pages * sizeof(*zram->handle));
+ if (!zram->handle) {
pr_err("Error allocating zram address table\n");
ret = -ENOMEM;
goto fail_no_table;
diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
index df2eec4..8aa733c 100644
--- a/drivers/staging/zram/zram_drv.h
+++ b/drivers/staging/zram/zram_drv.h
@@ -54,24 +54,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))

-/* Flags for zram pages (table[page_no].flags) */
-enum zram_pageflags {
- /* Page consists entirely of zeros */
- ZRAM_ZERO,
-
- __NR_ZRAM_PAGEFLAGS,
-};
+static const unsigned long zero_page_handle = (unsigned long)(-1);

/*-- Data structures */

-/* Allocated for each disk page */
-struct table {
- unsigned long handle;
- u16 size; /* object size (excluding header) */
- u8 count; /* object ref count (not yet used) */
- u8 flags;
-} __aligned(4);
-
struct zram_stats {
u64 compr_size; /* compressed size of pages stored */
u64 num_reads; /* failed + successful */
@@ -90,7 +76,7 @@ struct zram {
struct zs_pool *mem_pool;
void *compress_workmem;
void *compress_buffer;
- struct table *table;
+ unsigned long *handle; /* memory handle for each disk page */
spinlock_t stat64_lock; /* protect 64-bit stats */
struct rw_semaphore lock; /* protect compression buffers and table
* against concurrent read and writes */
--
1.7.10.4

2012-11-27 16:22:29

by Jerome Marchand

[permalink] [raw]
Subject: Re: [PATCH 2/2] zram: reduce metadata overhead

On 11/27/2012 08:26 AM, Nitin Gupta wrote:
> For every allocated object, zram maintains the the handle, size,
> flags and count fields. Of these, only the handle is required
> since zsmalloc now provides the object size given the handle.
> The flags field was needed only to mark a given page as zero-filled.
> Instead of this field, we now use an invalid value (-1) to mark such

Since the handle is unsigned, is this really an invalid value?

> pages. Lastly, the count field was unused, so was simply removed.
>
> Signed-off-by: Nitin Gupta <[email protected]>
> ---
> drivers/staging/zram/zram_drv.c | 80 ++++++++++++++-------------------------
> drivers/staging/zram/zram_drv.h | 18 +--------
> 2 files changed, 31 insertions(+), 67 deletions(-)
>

> [...]

> @@ -222,8 +202,8 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
>
> page = bvec->bv_page;
>
> - if (unlikely(!zram->table[index].handle) ||
> - zram_test_flag(zram, index, ZRAM_ZERO)) {
> + if (unlikely(!zram->handle[index]) ||
> + (zram->handle[index] == zero_page_handle)) {

This kind of comparison appears often. To my taste, an is_zero_page()
helper would be welcome.

> handle_zero_page(bvec);
> return 0;
> }

> [...]

> @@ -573,8 +551,8 @@ int zram_init_device(struct zram *zram)
> }
>
> num_pages = zram->disksize >> PAGE_SHIFT;
> - zram->table = vzalloc(num_pages * sizeof(*zram->table));
> - if (!zram->table) {
> + zram->handle = vzalloc(num_pages * sizeof(*zram->handle));
> + if (!zram->handle) {
> pr_err("Error allocating zram address table\n");
> ret = -ENOMEM;
> goto fail_no_table;

Since the table goes away, for readability sake, we'd better remove all
reference to it in messages, labels and comment and write "handle"
instead.

Otherwise, the patch looks good to me.

Jerome

> diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
> index df2eec4..8aa733c 100644
> --- a/drivers/staging/zram/zram_drv.h
> +++ b/drivers/staging/zram/zram_drv.h
> @@ -54,24 +54,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
> #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
> (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
>
> -/* Flags for zram pages (table[page_no].flags) */
> -enum zram_pageflags {
> - /* Page consists entirely of zeros */
> - ZRAM_ZERO,
> -
> - __NR_ZRAM_PAGEFLAGS,
> -};
> +static const unsigned long zero_page_handle = (unsigned long)(-1);
>
> /*-- Data structures */
>
> -/* Allocated for each disk page */
> -struct table {
> - unsigned long handle;
> - u16 size; /* object size (excluding header) */
> - u8 count; /* object ref count (not yet used) */
> - u8 flags;
> -} __aligned(4);
> -
> struct zram_stats {
> u64 compr_size; /* compressed size of pages stored */
> u64 num_reads; /* failed + successful */
> @@ -90,7 +76,7 @@ struct zram {
> struct zs_pool *mem_pool;
> void *compress_workmem;
> void *compress_buffer;
> - struct table *table;
> + unsigned long *handle; /* memory handle for each disk page */
> spinlock_t stat64_lock; /* protect 64-bit stats */
> struct rw_semaphore lock; /* protect compression buffers and table
> * against concurrent read and writes */
>

2012-11-29 01:40:52

by Nitin Gupta

[permalink] [raw]
Subject: Re: [PATCH 2/2] zram: reduce metadata overhead

On 11/27/2012 08:22 AM, Jerome Marchand wrote:
> On 11/27/2012 08:26 AM, Nitin Gupta wrote:
>> For every allocated object, zram maintains the the handle, size,
>> flags and count fields. Of these, only the handle is required
>> since zsmalloc now provides the object size given the handle.
>> The flags field was needed only to mark a given page as zero-filled.
>> Instead of this field, we now use an invalid value (-1) to mark such
> Since the handle is unsigned, is this really an invalid value?
>

Great catch. -1 is not always invalid but +1 definitely is. I will fix this.

>> pages. Lastly, the count field was unused, so was simply removed.
>>
>> Signed-off-by: Nitin Gupta <[email protected]>
>> ---
>> drivers/staging/zram/zram_drv.c | 80 ++++++++++++++-------------------------
>> drivers/staging/zram/zram_drv.h | 18 +--------
>> 2 files changed, 31 insertions(+), 67 deletions(-)
>>
>> [...]
>> @@ -222,8 +202,8 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
>>
>> page = bvec->bv_page;
>>
>> - if (unlikely(!zram->table[index].handle) ||
>> - zram_test_flag(zram, index, ZRAM_ZERO)) {
>> + if (unlikely(!zram->handle[index]) ||
>> + (zram->handle[index] == zero_page_handle)) {
> This kind of comparison appears often. To my taste, an is_zero_page()
> helper would be welcome.

Actually, I was hit by assignment-instead-of-comparison bug once
during these changes, so this helper really should be there.

>> handle_zero_page(bvec);
>> return 0;
>> }
>> [...]
>> @@ -573,8 +551,8 @@ int zram_init_device(struct zram *zram)
>> }
>>
>> num_pages = zram->disksize >> PAGE_SHIFT;
>> - zram->table = vzalloc(num_pages * sizeof(*zram->table));
>> - if (!zram->table) {
>> + zram->handle = vzalloc(num_pages * sizeof(*zram->handle));
>> + if (!zram->handle) {
>> pr_err("Error allocating zram address table\n");
>> ret = -ENOMEM;
>> goto fail_no_table;
> Since the table goes away, for readability sake, we'd better remove all
> reference to it in messages, labels and comment and write "handle"
> instead.

Missed those messages, will fix that.

> Otherwise, the patch looks good to me.

Thanks for the review.
Nitin

>
>> diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
>> index df2eec4..8aa733c 100644
>> --- a/drivers/staging/zram/zram_drv.h
>> +++ b/drivers/staging/zram/zram_drv.h
>> @@ -54,24 +54,10 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
>> #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
>> (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
>>
>> -/* Flags for zram pages (table[page_no].flags) */
>> -enum zram_pageflags {
>> - /* Page consists entirely of zeros */
>> - ZRAM_ZERO,
>> -
>> - __NR_ZRAM_PAGEFLAGS,
>> -};
>> +static const unsigned long zero_page_handle = (unsigned long)(-1);
>>
>> /*-- Data structures */
>>
>> -/* Allocated for each disk page */
>> -struct table {
>> - unsigned long handle;
>> - u16 size; /* object size (excluding header) */
>> - u8 count; /* object ref count (not yet used) */
>> - u8 flags;
>> -} __aligned(4);
>> -
>> struct zram_stats {
>> u64 compr_size; /* compressed size of pages stored */
>> u64 num_reads; /* failed + successful */
>> @@ -90,7 +76,7 @@ struct zram {
>> struct zs_pool *mem_pool;
>> void *compress_workmem;
>> void *compress_buffer;
>> - struct table *table;
>> + unsigned long *handle; /* memory handle for each disk page */
>> spinlock_t stat64_lock; /* protect 64-bit stats */
>> struct rw_semaphore lock; /* protect compression buffers and table
>> * against concurrent read and writes */
>>

2012-11-29 07:45:34

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
> Adds zs_get_object_size(handle) which provides the size of
> the given object. This is useful since the user (zram etc.)
> now do not have to maintain object sizes separately, saving
> on some metadata size (4b per page).
>
> The object handle encodes <page, offset> pair which currently points

Nitpick.

<page, index> in the description would be proper instead of
<page, offset>. You are going to replace <page, idx> with <page, offset>.

> to the start of the object. Now, the handle implicitly stores the size
> information by pointing to the object's end instead. Since zsmalloc is
> a slab based allocator, the start of the object can be easily determined
> and the difference between the end offset encoded in the handle and the
> start gives us the object size.

It's a good idea. Look at just minor comment below.

Let's talk with another concern. This patch surely helps current
customer's memory usage who should add 4 byte for accounting the
statistics while zsmalloc could be slow down.
Is it really valuable?

Yes. zram/zcache had a tight coupling with zsmalloc so it already
lost modularity. :( In this POV, this patch makes sense.
But if someone are willing to remove statistics for performance?
Although he removes it, zsmalloc is still slowed down.

Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
and it accelerates dependency with customer so it makes changing allocator
hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
it's not good that worry future things too early without any plan.
So I'm not strongly against you. If no reviewer raises an eyebrow,
I will rely on your decision.

Thanks.

>
> Signed-off-by: Nitin Gupta <[email protected]>
> ---
> drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
> drivers/staging/zsmalloc/zsmalloc.h | 1 +
> 2 files changed, 127 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
> index 09a9d35..65c9d3b 100644
> --- a/drivers/staging/zsmalloc/zsmalloc-main.c
> +++ b/drivers/staging/zsmalloc/zsmalloc-main.c
> @@ -112,20 +112,20 @@
> #define MAX_PHYSMEM_BITS 36
> #else /* !CONFIG_HIGHMEM64G */
> /*
> - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
> + * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
> * be PAGE_SHIFT
> */
> #define MAX_PHYSMEM_BITS BITS_PER_LONG
> #endif
> #endif
> #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
> -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
> -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
> +#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
> +#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)
>
> #define MAX(a, b) ((a) >= (b) ? (a) : (b))
> /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
> #define ZS_MIN_ALLOC_SIZE \
> - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
> + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
> #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
>
> /*
> @@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
> return PagePrivate2(page);
> }
>
> +static unsigned long get_page_index(struct page *page)

IMO, first_obj_offset(struct page *page) would be better readable.

> +{
> + return is_first_page(page) ? 0 : page->index;
> +}
> +
> static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
> enum fullness_group *fullness)
> {
> @@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
> return next;
> }
>
> -/* Encode <page, obj_idx> as a single handle value */
> -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
> +static struct page *get_prev_page(struct page *page)
> {
> - unsigned long handle;
> + struct page *prev, *first_page;
>
> - if (!page) {
> - BUG_ON(obj_idx);
> - return NULL;
> - }
> + first_page = get_first_page(page);
> + if (page == first_page)
> + prev = NULL;
> + else if (page == (struct page *)first_page->private)
> + prev = first_page;
> + else
> + prev = list_entry(page->lru.prev, struct page, lru);
>
> - handle = page_to_pfn(page) << OBJ_INDEX_BITS;
> - handle |= (obj_idx & OBJ_INDEX_MASK);
> + return prev;
>
> - return (void *)handle;
> }
>
> -/* Decode <page, obj_idx> pair from the given object handle */
> -static void obj_handle_to_location(unsigned long handle, struct page **page,
> - unsigned long *obj_idx)
> +static void *encode_ptr(struct page *page, unsigned long offset)
> {
> - *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
> - *obj_idx = handle & OBJ_INDEX_MASK;
> + unsigned long ptr;
> + ptr = page_to_pfn(page) << OFFSET_BITS;
> + ptr |= offset & OFFSET_MASK;
> + return (void *)ptr;
> +}
> +
> +static void decode_ptr(unsigned long ptr, struct page **page,
> + unsigned int *offset)
> +{
> + *page = pfn_to_page(ptr >> OFFSET_BITS);
> + *offset = ptr & OFFSET_MASK;
> +}
> +
> +static struct page *obj_handle_to_page(unsigned long handle)
> +{
> + struct page *page;
> + unsigned int offset;
> +
> + decode_ptr(handle, &page, &offset);
> + if (offset < get_page_index(page))
> + page = get_prev_page(page);
> +
> + return page;
> +}
> +
> +static unsigned int obj_handle_to_offset(unsigned long handle,
> + unsigned int class_size)
> +{
> + struct page *page;
> + unsigned int offset;
> +
> + decode_ptr(handle, &page, &offset);
> + if (offset < get_page_index(page))
> + offset = PAGE_SIZE - class_size + get_page_index(page);

Althoug it's trivial, we can reduce get_page_index calling.

> + else
> + offset = roundup(offset, class_size) - class_size;
> +
> + return offset;
> }
>
> -static unsigned long obj_idx_to_offset(struct page *page,
> - unsigned long obj_idx, int class_size)
> +/* Encode <page, offset, size> as a single handle value */

Need more kind comment about encoding scheme with obj's end <page, offset>

> +static void *obj_location_to_handle(struct page *page, unsigned int offset,
> + unsigned int size, unsigned int class_size)
> {
> - unsigned long off = 0;
> + struct page *endpage;
> + unsigned int endoffset;
>
> - if (!is_first_page(page))
> - off = page->index;
> + if (!page) {
> + BUG_ON(offset);
> + return NULL;
> + }

What do you expect to catch with the above check?

> + BUG_ON(offset >= PAGE_SIZE);
> +
> + endpage = page;
> + endoffset = offset + size - 1;
> + if (endoffset >= PAGE_SIZE) {
> + endpage = get_next_page(page);
> + BUG_ON(!endpage);
> + endoffset -= PAGE_SIZE;
> + }
>
> - return off + obj_idx * class_size;
> + return encode_ptr(endpage, endoffset);
> }
>
> static void reset_page(struct page *page)
> @@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
> /* Initialize a newly allocated zspage */
> static void init_zspage(struct page *first_page, struct size_class *class)
> {
> - unsigned long off = 0;
> + unsigned long off = 0, next_off = 0;
> struct page *page = first_page;
>
> BUG_ON(!is_first_page(first_page));
> while (page) {
> struct page *next_page;
> struct link_free *link;
> - unsigned int i, objs_on_page;
>
> /*
> * page->index stores offset of first object starting
> @@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
>
> link = (struct link_free *)kmap_atomic(page) +
> off / sizeof(*link);
> - objs_on_page = (PAGE_SIZE - off) / class->size;
>
> - for (i = 1; i <= objs_on_page; i++) {
> - off += class->size;
> - if (off < PAGE_SIZE) {
> - link->next = obj_location_to_handle(page, i);
> - link += class->size / sizeof(*link);
> - }
> + next_off = off + class->size;
> + while (next_off < PAGE_SIZE) {
> + link->next = encode_ptr(page, next_off);
> + link += class->size / sizeof(*link);
> + next_off += class->size;
> }
>
> /*
> @@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
> * page (if present)
> */
> next_page = get_next_page(page);
> - link->next = obj_location_to_handle(next_page, 0);
> + next_off = next_page ? next_off - PAGE_SIZE : 0;
> + link->next = encode_ptr(next_page, next_off);
> kunmap_atomic(link);
> page = next_page;
> - off = (off + class->size) % PAGE_SIZE;
> + off = next_off;
> }
> }
>
> @@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
>
> init_zspage(first_page, class);
>
> - first_page->freelist = obj_location_to_handle(first_page, 0);
> + first_page->freelist = encode_ptr(first_page, 0);
> /* Maximum number of objects we can store in this zspage */
> first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
>
> @@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> struct size_class *class;
>
> struct page *first_page, *m_page;
> - unsigned long m_objidx, m_offset;
> + unsigned int m_offset;
>
> if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
> return 0;
> @@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> }
>
> obj = (unsigned long)first_page->freelist;
> - obj_handle_to_location(obj, &m_page, &m_objidx);
> - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
> + decode_ptr(obj, &m_page, &m_offset);
>
> link = (struct link_free *)kmap_atomic(m_page) +
> m_offset / sizeof(*link);
> @@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> first_page->inuse++;
> /* Now move the zspage to another fullness group, if required */
> fix_fullness_group(pool, first_page);
> +
> + obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
> + size, class->size);
> spin_unlock(&class->lock);
>
> return obj;
> @@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> {
> struct link_free *link;
> struct page *first_page, *f_page;
> - unsigned long f_objidx, f_offset;
> + unsigned long f_offset;
>
> int class_idx;
> struct size_class *class;
> @@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> if (unlikely(!obj))
> return;
>
> - obj_handle_to_location(obj, &f_page, &f_objidx);
> + f_page = obj_handle_to_page(obj);
> first_page = get_first_page(f_page);
>
> get_zspage_mapping(first_page, &class_idx, &fullness);
> class = &pool->size_class[class_idx];
> - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
> + f_offset = obj_handle_to_offset(obj, class->size);
>
> spin_lock(&class->lock);
>
> @@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> + f_offset);
> link->next = first_page->freelist;
> kunmap_atomic(link);
> - first_page->freelist = (void *)obj;
> + first_page->freelist = encode_ptr(f_page, f_offset);
>
> first_page->inuse--;
> fullness = fix_fullness_group(pool, first_page);
> @@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
> * This function returns with preemption and page faults disabled.
> */
> void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> - enum zs_mapmode mm)
> + enum zs_mapmode mm)
> {
> struct page *page;
> - unsigned long obj_idx, off;
> + unsigned long off;
>
> unsigned int class_idx;
> enum fullness_group fg;
> @@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> */
> BUG_ON(in_interrupt());
>
> - obj_handle_to_location(handle, &page, &obj_idx);
> + page = obj_handle_to_page(handle);
> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> class = &pool->size_class[class_idx];
> - off = obj_idx_to_offset(page, obj_idx, class->size);
> + off = obj_handle_to_offset(handle, class->size);
>
> area = &get_cpu_var(zs_map_area);
> area->vm_mm = mm;
> @@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
> void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> {
> struct page *page;
> - unsigned long obj_idx, off;
> + unsigned long off;
>
> unsigned int class_idx;
> enum fullness_group fg;
> @@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>
> BUG_ON(!handle);
>
> - obj_handle_to_location(handle, &page, &obj_idx);
> + page = obj_handle_to_page(handle);
> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> class = &pool->size_class[class_idx];
> - off = obj_idx_to_offset(page, obj_idx, class->size);
> + off = obj_handle_to_offset(handle, class->size);
>
> area = &__get_cpu_var(zs_map_area);
> if (off + class->size <= PAGE_SIZE)
> @@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> }
> EXPORT_SYMBOL_GPL(zs_unmap_object);
>
> +size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
> +{
> + struct page *endpage;
> + unsigned int endoffset, size;
> +
> + unsigned int class_idx;
> + enum fullness_group fg;
> + struct size_class *class;
> +
> + decode_ptr(handle, &endpage, &endoffset);
> + get_zspage_mapping(endpage, &class_idx, &fg);
> + class = &pool->size_class[class_idx];
> +
> + size = endoffset + 1;
> + if (endoffset < get_page_index(endpage))
> + size += class->size - get_page_index(endpage);
> + else
> + size -= rounddown(endoffset, class->size);
> +
> + return size;
> +}
> +EXPORT_SYMBOL_GPL(zs_get_object_size);
> +
> u64 zs_get_total_size_bytes(struct zs_pool *pool)
> {
> int i;
> diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
> index de2e8bf..2830fdf 100644
> --- a/drivers/staging/zsmalloc/zsmalloc.h
> +++ b/drivers/staging/zsmalloc/zsmalloc.h
> @@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> enum zs_mapmode mm);
> void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
>
> +size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
> u64 zs_get_total_size_bytes(struct zs_pool *pool);
>
> #endif
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Kind regards,
Minchan Kim

2012-11-29 09:10:10

by Nitin Gupta

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On 11/28/2012 11:45 PM, Minchan Kim wrote:
> On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
>> Adds zs_get_object_size(handle) which provides the size of
>> the given object. This is useful since the user (zram etc.)
>> now do not have to maintain object sizes separately, saving
>> on some metadata size (4b per page).
>>
>> The object handle encodes <page, offset> pair which currently points
>
> Nitpick.
>
> <page, index> in descrption would be proper instead of
> <page, offset>. You are going to replace <page, idx> with <page, offset>.
>

I think 'offset' conveys the meaning more clearly; 'index' is after all
just a chopped-off version of offset :)


>> to the start of the object. Now, the handle implicitly stores the size
>> information by pointing to the object's end instead. Since zsmalloc is
>> a slab based allocator, the start of the object can be easily determined
>> and the difference between the end offset encoded in the handle and the
>> start gives us the object size.
>
> It's a good idea. Look at just minor comment below.
>
> Let's talk with another concern. This patch surely helps current
> customer's memory usage who should add 4 byte for accounting the
> statistics while zsmalloc could be slow down.
> Is it really valuable?
>
> Yes. zram/zcache had a tight coupling with zsmalloc so it already
> lost modularity. :( In this POV, this patch makes sense.
> But if someone are willing to remove statistics for performance?
> Although he remove it, zsmalloc is still slow down.
>
> Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
> and it accelerates dependency with customer so it makes changing allocator
> hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
> it's not good that worry future things too early without any plan.
> So I'm not strong againt you. If any reviewer don't raise an eyebrow,
> I wil rely on your decision.
>

Looking over the changes, I do not expect any difference in performance
-- just a bit more arithmetic; however, the use of get_prev_page(),
which may dereference a few extra pointers, might not be entirely free.
Also, my iozone test[1] shows very little difference in performance:

With just the fix for crash (patch 1/2):
9.28user 1159.84system 21:46.54elapsed 89%CPU (0avgtext+0avgdata
50200maxresident)k
212988544inputs+190660816outputs (1major+16706minor)pagefaults 0swaps

With the new get_object_size() code (patch 1/2 + patch 2/2):
9.20user 1112.63system 21:03.61elapsed 88%CPU (0avgtext+0avgdata
50200maxresident)k
194636568inputs+190500424outputs (1major+16707minor)pagefaults 0swaps

I really cannot explain this ~40s speedup, but anyway, I think
optimizing zsmalloc/zram should be taken up separately, at least since
this new code does not seem to have any significant effect.


[1] iozone test:
- Create zram of size 1200m
- Create ext4 fs
- iozone -a -g 1G
- In parallel: watch zram_stress

# zram_stress
sync
echo 1 | sudo tee /proc/sys/vm/drop_caches


Surely, not the most accurate of the tests but gives an idea if anything
is making a significant difference.

Thanks,
Nitin


>
>>
>> Signed-off-by: Nitin Gupta <[email protected]>
>> ---
>> drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
>> drivers/staging/zsmalloc/zsmalloc.h | 1 +
>> 2 files changed, 127 insertions(+), 51 deletions(-)
>>
>> diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
>> index 09a9d35..65c9d3b 100644
>> --- a/drivers/staging/zsmalloc/zsmalloc-main.c
>> +++ b/drivers/staging/zsmalloc/zsmalloc-main.c
>> @@ -112,20 +112,20 @@
>> #define MAX_PHYSMEM_BITS 36
>> #else /* !CONFIG_HIGHMEM64G */
>> /*
>> - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
>> + * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
>> * be PAGE_SHIFT
>> */
>> #define MAX_PHYSMEM_BITS BITS_PER_LONG
>> #endif
>> #endif
>> #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
>> -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
>> -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>> +#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
>> +#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)
>>
>> #define MAX(a, b) ((a) >= (b) ? (a) : (b))
>> /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
>> #define ZS_MIN_ALLOC_SIZE \
>> - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
>> + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
>> #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
>>
>> /*
>> @@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
>> return PagePrivate2(page);
>> }
>>
>> +static unsigned long get_page_index(struct page *page)
>
> IMO, first_obj_offset(struct page *page) would be better readable.
>
>> +{
>> + return is_first_page(page) ? 0 : page->index;
>> +}
>> +
>> static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
>> enum fullness_group *fullness)
>> {
>> @@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
>> return next;
>> }
>>
>> -/* Encode <page, obj_idx> as a single handle value */
>> -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
>> +static struct page *get_prev_page(struct page *page)
>> {
>> - unsigned long handle;
>> + struct page *prev, *first_page;
>>
>> - if (!page) {
>> - BUG_ON(obj_idx);
>> - return NULL;
>> - }
>> + first_page = get_first_page(page);
>> + if (page == first_page)
>> + prev = NULL;
>> + else if (page == (struct page *)first_page->private)
>> + prev = first_page;
>> + else
>> + prev = list_entry(page->lru.prev, struct page, lru);
>>
>> - handle = page_to_pfn(page) << OBJ_INDEX_BITS;
>> - handle |= (obj_idx & OBJ_INDEX_MASK);
>> + return prev;
>>
>> - return (void *)handle;
>> }
>>
>> -/* Decode <page, obj_idx> pair from the given object handle */
>> -static void obj_handle_to_location(unsigned long handle, struct page **page,
>> - unsigned long *obj_idx)
>> +static void *encode_ptr(struct page *page, unsigned long offset)
>> {
>> - *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
>> - *obj_idx = handle & OBJ_INDEX_MASK;
>> + unsigned long ptr;
>> + ptr = page_to_pfn(page) << OFFSET_BITS;
>> + ptr |= offset & OFFSET_MASK;
>> + return (void *)ptr;
>> +}
>> +
>> +static void decode_ptr(unsigned long ptr, struct page **page,
>> + unsigned int *offset)
>> +{
>> + *page = pfn_to_page(ptr >> OFFSET_BITS);
>> + *offset = ptr & OFFSET_MASK;
>> +}
>> +
>> +static struct page *obj_handle_to_page(unsigned long handle)
>> +{
>> + struct page *page;
>> + unsigned int offset;
>> +
>> + decode_ptr(handle, &page, &offset);
>> + if (offset < get_page_index(page))
>> + page = get_prev_page(page);
>> +
>> + return page;
>> +}
>> +
>> +static unsigned int obj_handle_to_offset(unsigned long handle,
>> + unsigned int class_size)
>> +{
>> + struct page *page;
>> + unsigned int offset;
>> +
>> + decode_ptr(handle, &page, &offset);
>> + if (offset < get_page_index(page))
>> + offset = PAGE_SIZE - class_size + get_page_index(page);
>
> Althoug it's trivial, we can reduce get_page_index calling.
>
>> + else
>> + offset = roundup(offset, class_size) - class_size;
>> +
>> + return offset;
>> }
>>
>> -static unsigned long obj_idx_to_offset(struct page *page,
>> - unsigned long obj_idx, int class_size)
>> +/* Encode <page, offset, size> as a single handle value */
>
> Need more kind comment about encoding scheme with obj's end <page, offset>
>
>> +static void *obj_location_to_handle(struct page *page, unsigned int offset,
>> + unsigned int size, unsigned int class_size)
>> {
>> - unsigned long off = 0;
>> + struct page *endpage;
>> + unsigned int endoffset;
>>
>> - if (!is_first_page(page))
>> - off = page->index;
>> + if (!page) {
>> + BUG_ON(offset);
>> + return NULL;
>> + }
>
> What do you expect to catch with above check?
>
>> + BUG_ON(offset >= PAGE_SIZE);
>> +
>> + endpage = page;
>> + endoffset = offset + size - 1;
>> + if (endoffset >= PAGE_SIZE) {
>> + endpage = get_next_page(page);
>> + BUG_ON(!endpage);
>> + endoffset -= PAGE_SIZE;
>> + }
>>
>> - return off + obj_idx * class_size;
>> + return encode_ptr(endpage, endoffset);
>> }
>>
>> static void reset_page(struct page *page)
>> @@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
>> /* Initialize a newly allocated zspage */
>> static void init_zspage(struct page *first_page, struct size_class *class)
>> {
>> - unsigned long off = 0;
>> + unsigned long off = 0, next_off = 0;
>> struct page *page = first_page;
>>
>> BUG_ON(!is_first_page(first_page));
>> while (page) {
>> struct page *next_page;
>> struct link_free *link;
>> - unsigned int i, objs_on_page;
>>
>> /*
>> * page->index stores offset of first object starting
>> @@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
>>
>> link = (struct link_free *)kmap_atomic(page) +
>> off / sizeof(*link);
>> - objs_on_page = (PAGE_SIZE - off) / class->size;
>>
>> - for (i = 1; i <= objs_on_page; i++) {
>> - off += class->size;
>> - if (off < PAGE_SIZE) {
>> - link->next = obj_location_to_handle(page, i);
>> - link += class->size / sizeof(*link);
>> - }
>> + next_off = off + class->size;
>> + while (next_off < PAGE_SIZE) {
>> + link->next = encode_ptr(page, next_off);
>> + link += class->size / sizeof(*link);
>> + next_off += class->size;
>> }
>>
>> /*
>> @@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
>> * page (if present)
>> */
>> next_page = get_next_page(page);
>> - link->next = obj_location_to_handle(next_page, 0);
>> + next_off = next_page ? next_off - PAGE_SIZE : 0;
>> + link->next = encode_ptr(next_page, next_off);
>> kunmap_atomic(link);
>> page = next_page;
>> - off = (off + class->size) % PAGE_SIZE;
>> + off = next_off;
>> }
>> }
>>
>> @@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
>>
>> init_zspage(first_page, class);
>>
>> - first_page->freelist = obj_location_to_handle(first_page, 0);
>> + first_page->freelist = encode_ptr(first_page, 0);
>> /* Maximum number of objects we can store in this zspage */
>> first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
>>
>> @@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> struct size_class *class;
>>
>> struct page *first_page, *m_page;
>> - unsigned long m_objidx, m_offset;
>> + unsigned int m_offset;
>>
>> if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
>> return 0;
>> @@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> }
>>
>> obj = (unsigned long)first_page->freelist;
>> - obj_handle_to_location(obj, &m_page, &m_objidx);
>> - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
>> + decode_ptr(obj, &m_page, &m_offset);
>>
>> link = (struct link_free *)kmap_atomic(m_page) +
>> m_offset / sizeof(*link);
>> @@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> first_page->inuse++;
>> /* Now move the zspage to another fullness group, if required */
>> fix_fullness_group(pool, first_page);
>> +
>> + obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
>> + size, class->size);
>> spin_unlock(&class->lock);
>>
>> return obj;
>> @@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> {
>> struct link_free *link;
>> struct page *first_page, *f_page;
>> - unsigned long f_objidx, f_offset;
>> + unsigned long f_offset;
>>
>> int class_idx;
>> struct size_class *class;
>> @@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> if (unlikely(!obj))
>> return;
>>
>> - obj_handle_to_location(obj, &f_page, &f_objidx);
>> + f_page = obj_handle_to_page(obj);
>> first_page = get_first_page(f_page);
>>
>> get_zspage_mapping(first_page, &class_idx, &fullness);
>> class = &pool->size_class[class_idx];
>> - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
>> + f_offset = obj_handle_to_offset(obj, class->size);
>>
>> spin_lock(&class->lock);
>>
>> @@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> + f_offset);
>> link->next = first_page->freelist;
>> kunmap_atomic(link);
>> - first_page->freelist = (void *)obj;
>> + first_page->freelist = encode_ptr(f_page, f_offset);
>>
>> first_page->inuse--;
>> fullness = fix_fullness_group(pool, first_page);
>> @@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
>> * This function returns with preemption and page faults disabled.
>> */
>> void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> - enum zs_mapmode mm)
>> + enum zs_mapmode mm)
>> {
>> struct page *page;
>> - unsigned long obj_idx, off;
>> + unsigned long off;
>>
>> unsigned int class_idx;
>> enum fullness_group fg;
>> @@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> */
>> BUG_ON(in_interrupt());
>>
>> - obj_handle_to_location(handle, &page, &obj_idx);
>> + page = obj_handle_to_page(handle);
>> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
>> class = &pool->size_class[class_idx];
>> - off = obj_idx_to_offset(page, obj_idx, class->size);
>> + off = obj_handle_to_offset(handle, class->size);
>>
>> area = &get_cpu_var(zs_map_area);
>> area->vm_mm = mm;
>> @@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
>> void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>> {
>> struct page *page;
>> - unsigned long obj_idx, off;
>> + unsigned long off;
>>
>> unsigned int class_idx;
>> enum fullness_group fg;
>> @@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>>
>> BUG_ON(!handle);
>>
>> - obj_handle_to_location(handle, &page, &obj_idx);
>> + page = obj_handle_to_page(handle);
>> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
>> class = &pool->size_class[class_idx];
>> - off = obj_idx_to_offset(page, obj_idx, class->size);
>> + off = obj_handle_to_offset(handle, class->size);
>>
>> area = &__get_cpu_var(zs_map_area);
>> if (off + class->size <= PAGE_SIZE)
>> @@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>> }
>> EXPORT_SYMBOL_GPL(zs_unmap_object);
>>
>> +size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
>> +{
>> + struct page *endpage;
>> + unsigned int endoffset, size;
>> +
>> + unsigned int class_idx;
>> + enum fullness_group fg;
>> + struct size_class *class;
>> +
>> + decode_ptr(handle, &endpage, &endoffset);
>> + get_zspage_mapping(endpage, &class_idx, &fg);
>> + class = &pool->size_class[class_idx];
>> +
>> + size = endoffset + 1;
>> + if (endoffset < get_page_index(endpage))
>> + size += class->size - get_page_index(endpage);
>> + else
>> + size -= rounddown(endoffset, class->size);
>> +
>> + return size;
>> +}
>> +EXPORT_SYMBOL_GPL(zs_get_object_size);
>> +
>> u64 zs_get_total_size_bytes(struct zs_pool *pool)
>> {
>> int i;
>> diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
>> index de2e8bf..2830fdf 100644
>> --- a/drivers/staging/zsmalloc/zsmalloc.h
>> +++ b/drivers/staging/zsmalloc/zsmalloc.h
>> @@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> enum zs_mapmode mm);
>> void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
>>
>> +size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
>> u64 zs_get_total_size_bytes(struct zs_pool *pool);
>>
>> #endif
>> --
>> 1.7.10.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>

2012-11-30 00:03:13

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On Thu, Nov 29, 2012 at 01:09:56AM -0800, Nitin Gupta wrote:
> On 11/28/2012 11:45 PM, Minchan Kim wrote:
> >On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
> >>Adds zs_get_object_size(handle) which provides the size of
> >>the given object. This is useful since the user (zram etc.)
> >>now do not have to maintain object sizes separately, saving
> >>on some metadata size (4b per page).
> >>
> >>The object handle encodes <page, offset> pair which currently points
> >
> >Nitpick.
> >
> ><page, index> in descrption would be proper instead of
> ><page, offset>. You are going to replace <page, idx> with <page, offset>.
> >
>
> I think 'offset' conveys the meaning more clearly; 'index' is after
> all just a chopped-off version of offset :)

In my perception, an offset is a location measured in bytes from some
reference point, while an index is something we have to multiply by
sizeof(object) to get the offset.
So you used index before the patch, but now you are trying to use offset
instead of index.

Anyway, it's minor nitpick. Never mind if you don't agree. :)

>
>
> >>to the start of the object. Now, the handle implicitly stores the size
> >>information by pointing to the object's end instead. Since zsmalloc is
> >>a slab based allocator, the start of the object can be easily determined
> >>and the difference between the end offset encoded in the handle and the
> >>start gives us the object size.
> >
> >It's a good idea. Look at just minor comment below.
> >
> >Let's talk with another concern. This patch surely helps current
> >customer's memory usage who should add 4 byte for accounting the
> >statistics while zsmalloc could be slow down.
> >Is it really valuable?
> >
> >Yes. zram/zcache had a tight coupling with zsmalloc so it already
> >lost modularity. :( In this POV, this patch makes sense.
> >But if someone are willing to remove statistics for performance?
> >Although he remove it, zsmalloc is still slow down.
> >
> >Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
> >and it accelerates dependency with customer so it makes changing allocator
> >hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
> >it's not good that worry future things too early without any plan.
> >So I'm not strong againt you. If any reviewer don't raise an eyebrow,
> >I wil rely on your decision.
> >
>
> Looking over the changes I do not expect any difference in
> performance -- just a bit more arithmetic, however the use of
> get_prev_page() which may dereference a few extra pointers might not
> be really free. Also, my iozone test[1] shows very little difference
> in performance:

An iozone test isn't enough to prove a minor slowdown, because it has a lot
of noise from the I/O path and a varying compression ratio per I/O.

>
> With just the fix for crash (patch 1/2):
> 9.28user 1159.84system 21:46.54elapsed 89%CPU (0avgtext+0avgdata
> 50200maxresident)k
> 212988544inputs+190660816outputs (1major+16706minor)pagefaults 0swaps
>
> With the new get_object_size() code (patch 1/2 + patch 2/2):
> 9.20user 1112.63system 21:03.61elapsed 88%CPU (0avgtext+0avgdata
> 50200maxresident)k
> 194636568inputs+190500424outputs (1major+16707minor)pagefaults 0swaps
>
> I really cannot explain this ~40s speedup but anyways, I think
> optimizing zsmalloc/zram should be taken up separately, at least
> when this new code does not seem to have any significant effects.
>
>
> [1] iozone test:
> - Create zram of size 1200m
> - Create ext4 fs
> - iozone -a -g 1G
> - In parallel: watch zram_stress
>
> # zram_stress
> sync
> echo 1 | sudo tee /proc/sys/vm/drop_caches
>
>
> Surely, not the most accurate of the tests but gives an idea if
> anything is making a significant difference.

For a more accurate test, it would be better to use zsmapbench by Seth.
https://github.com/spartacus06/zsmapbench
Frankly speaking, I don't expect a significant regression, as you said.
My bigger concern is that we are going to create a tight coupling between
zram/zcache and zsmalloc, which makes switching from zsmalloc to a smarter
allocator hard.
The reason I have such a concern is that I have a TODO item to support
migrating swap-over-zram pages to storage swap when zram is full. Yes, it's
just a plan with no schedule at the moment, so I can't insist on it, but if
I try it in the future, I might want to replace zsmalloc with another
allocator. In that case, tight coupling would be another hurdle.
But as I mentioned, I shouldn't block your great work with my idealistic
plan, which doesn't even have a real schedule. So I can still follow your
opinion.
Please resend the patch with the fix if you think it's really worthwhile. :)

Thanks!

>
> Thanks,
> Nitin
>
>
> >
> >>
> >>Signed-off-by: Nitin Gupta <[email protected]>
> >>---
> >> drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
> >> drivers/staging/zsmalloc/zsmalloc.h | 1 +
> >> 2 files changed, 127 insertions(+), 51 deletions(-)
> >>
> >>diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
> >>index 09a9d35..65c9d3b 100644
> >>--- a/drivers/staging/zsmalloc/zsmalloc-main.c
> >>+++ b/drivers/staging/zsmalloc/zsmalloc-main.c
> >>@@ -112,20 +112,20 @@
> >> #define MAX_PHYSMEM_BITS 36
> >> #else /* !CONFIG_HIGHMEM64G */
> >> /*
> >>- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
> >>+ * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
> >> * be PAGE_SHIFT
> >> */
> >> #define MAX_PHYSMEM_BITS BITS_PER_LONG
> >> #endif
> >> #endif
> >> #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
> >>-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
> >>-#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
> >>+#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
> >>+#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)
> >>
> >> #define MAX(a, b) ((a) >= (b) ? (a) : (b))
> >> /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
> >> #define ZS_MIN_ALLOC_SIZE \
> >>- MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
> >>+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
> >> #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
> >>
> >> /*
> >>@@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
> >> return PagePrivate2(page);
> >> }
> >>
> >>+static unsigned long get_page_index(struct page *page)
> >
> >IMO, first_obj_offset(struct page *page) would be better readable.
> >
> >>+{
> >>+ return is_first_page(page) ? 0 : page->index;
> >>+}
> >>+
> >> static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
> >> enum fullness_group *fullness)
> >> {
> >>@@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
> >> return next;
> >> }
> >>
> >>-/* Encode <page, obj_idx> as a single handle value */
> >>-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
> >>+static struct page *get_prev_page(struct page *page)
> >> {
> >>- unsigned long handle;
> >>+ struct page *prev, *first_page;
> >>
> >>- if (!page) {
> >>- BUG_ON(obj_idx);
> >>- return NULL;
> >>- }
> >>+ first_page = get_first_page(page);
> >>+ if (page == first_page)
> >>+ prev = NULL;
> >>+ else if (page == (struct page *)first_page->private)
> >>+ prev = first_page;
> >>+ else
> >>+ prev = list_entry(page->lru.prev, struct page, lru);
> >>
> >>- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
> >>- handle |= (obj_idx & OBJ_INDEX_MASK);
> >>+ return prev;
> >>
> >>- return (void *)handle;
> >> }
> >>
> >>-/* Decode <page, obj_idx> pair from the given object handle */
> >>-static void obj_handle_to_location(unsigned long handle, struct page **page,
> >>- unsigned long *obj_idx)
> >>+static void *encode_ptr(struct page *page, unsigned long offset)
> >> {
> >>- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
> >>- *obj_idx = handle & OBJ_INDEX_MASK;
> >>+ unsigned long ptr;
> >>+ ptr = page_to_pfn(page) << OFFSET_BITS;
> >>+ ptr |= offset & OFFSET_MASK;
> >>+ return (void *)ptr;
> >>+}
> >>+
> >>+static void decode_ptr(unsigned long ptr, struct page **page,
> >>+ unsigned int *offset)
> >>+{
> >>+ *page = pfn_to_page(ptr >> OFFSET_BITS);
> >>+ *offset = ptr & OFFSET_MASK;
> >>+}
> >>+
> >>+static struct page *obj_handle_to_page(unsigned long handle)
> >>+{
> >>+ struct page *page;
> >>+ unsigned int offset;
> >>+
> >>+ decode_ptr(handle, &page, &offset);
> >>+ if (offset < get_page_index(page))
> >>+ page = get_prev_page(page);
> >>+
> >>+ return page;
> >>+}
> >>+
> >>+static unsigned int obj_handle_to_offset(unsigned long handle,
> >>+ unsigned int class_size)
> >>+{
> >>+ struct page *page;
> >>+ unsigned int offset;
> >>+
> >>+ decode_ptr(handle, &page, &offset);
> >>+ if (offset < get_page_index(page))
> >>+ offset = PAGE_SIZE - class_size + get_page_index(page);
> >
> >Althoug it's trivial, we can reduce get_page_index calling.
> >
> >>+ else
> >>+ offset = roundup(offset, class_size) - class_size;
> >>+
> >>+ return offset;
> >> }
> >>
> >>-static unsigned long obj_idx_to_offset(struct page *page,
> >>- unsigned long obj_idx, int class_size)
> >>+/* Encode <page, offset, size> as a single handle value */
> >
> >Need more kind comment about encoding scheme with obj's end <page, offset>
> >
> >>+static void *obj_location_to_handle(struct page *page, unsigned int offset,
> >>+ unsigned int size, unsigned int class_size)
> >> {
> >>- unsigned long off = 0;
> >>+ struct page *endpage;
> >>+ unsigned int endoffset;
> >>
> >>- if (!is_first_page(page))
> >>- off = page->index;
> >>+ if (!page) {
> >>+ BUG_ON(offset);
> >>+ return NULL;
> >>+ }
> >
> >What do you expect to catch with above check?
> >
> >>+ BUG_ON(offset >= PAGE_SIZE);
> >>+
> >>+ endpage = page;
> >>+ endoffset = offset + size - 1;
> >>+ if (endoffset >= PAGE_SIZE) {
> >>+ endpage = get_next_page(page);
> >>+ BUG_ON(!endpage);
> >>+ endoffset -= PAGE_SIZE;
> >>+ }
> >>
> >>- return off + obj_idx * class_size;
> >>+ return encode_ptr(endpage, endoffset);
> >> }
> >>
> >> static void reset_page(struct page *page)
> >>@@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
> >> /* Initialize a newly allocated zspage */
> >> static void init_zspage(struct page *first_page, struct size_class *class)
> >> {
> >>- unsigned long off = 0;
> >>+ unsigned long off = 0, next_off = 0;
> >> struct page *page = first_page;
> >>
> >> BUG_ON(!is_first_page(first_page));
> >> while (page) {
> >> struct page *next_page;
> >> struct link_free *link;
> >>- unsigned int i, objs_on_page;
> >>
> >> /*
> >> * page->index stores offset of first object starting
> >>@@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
> >>
> >> link = (struct link_free *)kmap_atomic(page) +
> >> off / sizeof(*link);
> >>- objs_on_page = (PAGE_SIZE - off) / class->size;
> >>
> >>- for (i = 1; i <= objs_on_page; i++) {
> >>- off += class->size;
> >>- if (off < PAGE_SIZE) {
> >>- link->next = obj_location_to_handle(page, i);
> >>- link += class->size / sizeof(*link);
> >>- }
> >>+ next_off = off + class->size;
> >>+ while (next_off < PAGE_SIZE) {
> >>+ link->next = encode_ptr(page, next_off);
> >>+ link += class->size / sizeof(*link);
> >>+ next_off += class->size;
> >> }
> >>
> >> /*
> >>@@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
> >> * page (if present)
> >> */
> >> next_page = get_next_page(page);
> >>- link->next = obj_location_to_handle(next_page, 0);
> >>+ next_off = next_page ? next_off - PAGE_SIZE : 0;
> >>+ link->next = encode_ptr(next_page, next_off);
> >> kunmap_atomic(link);
> >> page = next_page;
> >>- off = (off + class->size) % PAGE_SIZE;
> >>+ off = next_off;
> >> }
> >> }
> >>
> >>@@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
> >>
> >> init_zspage(first_page, class);
> >>
> >>- first_page->freelist = obj_location_to_handle(first_page, 0);
> >>+ first_page->freelist = encode_ptr(first_page, 0);
> >> /* Maximum number of objects we can store in this zspage */
> >> first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
> >>
> >>@@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> struct size_class *class;
> >>
> >> struct page *first_page, *m_page;
> >>- unsigned long m_objidx, m_offset;
> >>+ unsigned int m_offset;
> >>
> >> if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
> >> return 0;
> >>@@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> }
> >>
> >> obj = (unsigned long)first_page->freelist;
> >>- obj_handle_to_location(obj, &m_page, &m_objidx);
> >>- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
> >>+ decode_ptr(obj, &m_page, &m_offset);
> >>
> >> link = (struct link_free *)kmap_atomic(m_page) +
> >> m_offset / sizeof(*link);
> >>@@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> first_page->inuse++;
> >> /* Now move the zspage to another fullness group, if required */
> >> fix_fullness_group(pool, first_page);
> >>+
> >>+ obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
> >>+ size, class->size);
> >> spin_unlock(&class->lock);
> >>
> >> return obj;
> >>@@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> {
> >> struct link_free *link;
> >> struct page *first_page, *f_page;
> >>- unsigned long f_objidx, f_offset;
> >>+ unsigned long f_offset;
> >>
> >> int class_idx;
> >> struct size_class *class;
> >>@@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> if (unlikely(!obj))
> >> return;
> >>
> >>- obj_handle_to_location(obj, &f_page, &f_objidx);
> >>+ f_page = obj_handle_to_page(obj);
> >> first_page = get_first_page(f_page);
> >>
> >> get_zspage_mapping(first_page, &class_idx, &fullness);
> >> class = &pool->size_class[class_idx];
> >>- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
> >>+ f_offset = obj_handle_to_offset(obj, class->size);
> >>
> >> spin_lock(&class->lock);
> >>
> >>@@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> + f_offset);
> >> link->next = first_page->freelist;
> >> kunmap_atomic(link);
> >>- first_page->freelist = (void *)obj;
> >>+ first_page->freelist = encode_ptr(f_page, f_offset);
> >>
> >> first_page->inuse--;
> >> fullness = fix_fullness_group(pool, first_page);
> >>@@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
> >> * This function returns with preemption and page faults disabled.
> >> */
> >> void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >>- enum zs_mapmode mm)
> >>+ enum zs_mapmode mm)
> >> {
> >> struct page *page;
> >>- unsigned long obj_idx, off;
> >>+ unsigned long off;
> >>
> >> unsigned int class_idx;
> >> enum fullness_group fg;
> >>@@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >> */
> >> BUG_ON(in_interrupt());
> >>
> >>- obj_handle_to_location(handle, &page, &obj_idx);
> >>+ page = obj_handle_to_page(handle);
> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> >> class = &pool->size_class[class_idx];
> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
> >>+ off = obj_handle_to_offset(handle, class->size);
> >>
> >> area = &get_cpu_var(zs_map_area);
> >> area->vm_mm = mm;
> >>@@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >> {
> >> struct page *page;
> >>- unsigned long obj_idx, off;
> >>+ unsigned long off;
> >>
> >> unsigned int class_idx;
> >> enum fullness_group fg;
> >>@@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >>
> >> BUG_ON(!handle);
> >>
> >>- obj_handle_to_location(handle, &page, &obj_idx);
> >>+ page = obj_handle_to_page(handle);
> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> >> class = &pool->size_class[class_idx];
> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
> >>+ off = obj_handle_to_offset(handle, class->size);
> >>
> >> area = &__get_cpu_var(zs_map_area);
> >> if (off + class->size <= PAGE_SIZE)
> >>@@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >> }
> >> EXPORT_SYMBOL_GPL(zs_unmap_object);
> >>
> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
> >>+{
> >>+ struct page *endpage;
> >>+ unsigned int endoffset, size;
> >>+
> >>+ unsigned int class_idx;
> >>+ enum fullness_group fg;
> >>+ struct size_class *class;
> >>+
> >>+ decode_ptr(handle, &endpage, &endoffset);
> >>+ get_zspage_mapping(endpage, &class_idx, &fg);
> >>+ class = &pool->size_class[class_idx];
> >>+
> >>+ size = endoffset + 1;
> >>+ if (endoffset < get_page_index(endpage))
> >>+ size += class->size - get_page_index(endpage);
> >>+ else
> >>+ size -= rounddown(endoffset, class->size);
> >>+
> >>+ return size;
> >>+}
> >>+EXPORT_SYMBOL_GPL(zs_get_object_size);
> >>+
> >> u64 zs_get_total_size_bytes(struct zs_pool *pool)
> >> {
> >> int i;
> >>diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
> >>index de2e8bf..2830fdf 100644
> >>--- a/drivers/staging/zsmalloc/zsmalloc.h
> >>+++ b/drivers/staging/zsmalloc/zsmalloc.h
> >>@@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >> enum zs_mapmode mm);
> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
> >>
> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
> >> u64 zs_get_total_size_bytes(struct zs_pool *pool);
> >>
> >> #endif
> >>--
> >>1.7.10.4
> >>
> >>--
> >>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >>the body of a message to [email protected]
> >>More majordomo info at http://vger.kernel.org/majordomo-info.html
> >>Please read the FAQ at http://www.tux.org/lkml/
> >
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Kind regards,
Minchan Kim

2012-11-30 00:53:18

by Nitin Gupta

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On Thu, Nov 29, 2012 at 4:03 PM, Minchan Kim <[email protected]> wrote:
> On Thu, Nov 29, 2012 at 01:09:56AM -0800, Nitin Gupta wrote:
>> On 11/28/2012 11:45 PM, Minchan Kim wrote:
>> >On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
>> >>Adds zs_get_object_size(handle) which provides the size of
>> >>the given object. This is useful since the user (zram etc.)
>> >>now do not have to maintain object sizes separately, saving
>> >>on some metadata size (4b per page).
>> >>
>> >>The object handle encodes <page, offset> pair which currently points
>> >
>> >Nitpick.
>> >
>> ><page, index> in descrption would be proper instead of
>> ><page, offset>. You are going to replace <page, idx> with <page, offset>.
>> >
>>
>> I think 'offset' conveys the meaning more clearly; 'index' is after
>> all just a chopped-off version of offset :)
>
> In my perceptoin, offset means location from some point by some byte
> while index is thing we have to multiply sizeof(object) to get.
> So you used index before the patch but now you try to use offset instead of
> index.
>
> Anyway, it's minor nitpick. Never mind if you don't agree. :)
>
>>
>>
>> >>to the start of the object. Now, the handle implicitly stores the size
>> >>information by pointing to the object's end instead. Since zsmalloc is
>> >>a slab based allocator, the start of the object can be easily determined
>> >>and the difference between the end offset encoded in the handle and the
>> >>start gives us the object size.
>> >
>> >It's a good idea. Look at just minor comment below.
>> >
>> >Let's talk with another concern. This patch surely helps current
>> >customer's memory usage who should add 4 byte for accounting the
>> >statistics while zsmalloc could be slow down.
>> >Is it really valuable?
>> >
>> >Yes. zram/zcache had a tight coupling with zsmalloc so it already
>> >lost modularity. :( In this POV, this patch makes sense.
>> >But if someone are willing to remove statistics for performance?
>> >Although he remove it, zsmalloc is still slow down.
>> >
>> >Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
>> >and it accelerates dependency with customer so it makes changing allocator
>> >hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
>> >it's not good that worry future things too early without any plan.
>> >So I'm not strong againt you. If any reviewer don't raise an eyebrow,
>> >I wil rely on your decision.
>> >
>>
>> Looking over the changes I do not expect any difference in
>> performance -- just a bit more arithmetic, however the use of
>> get_prev_page() which may dereference a few extra pointers might not
>> be really free. Also, my iozone test[1] shows very little difference
>> in performance:
>
> Iozone test isn't enough to prove the minor slow down because it would have
> many noise about I/O path and different compression ratio per I/O.
>
>>
>> With just the fix for crash (patch 1/2):
>> 9.28user 1159.84system 21:46.54elapsed 89%CPU (0avgtext+0avgdata
>> 50200maxresident)k
>> 212988544inputs+190660816outputs (1major+16706minor)pagefaults 0swaps
>>
>> With the new get_object_size() code (patch 1/2 + patch 2/2):
>> 9.20user 1112.63system 21:03.61elapsed 88%CPU (0avgtext+0avgdata
>> 50200maxresident)k
>> 194636568inputs+190500424outputs (1major+16707minor)pagefaults 0swaps
>>
>> I really cannot explain this ~40s speedup but anyways, I think
>> optimizing zsmalloc/zram should be taken up separately, at least
>> when this new code does not seem to have any significant effects.
>>
>>
>> [1] iozone test:
>> - Create zram of size 1200m
>> - Create ext4 fs
>> - iozone -a -g 1G
>> - In parallel: watch zram_stress
>>
>> # zram_stress
>> sync
>> echo 1 | sudo tee /proc/sys/vm/drop_caches
>>
>>
>> Surely, not the most accurate of the tests but gives an idea if
>> anything is making a significant difference.
>
> For more accurate test, it would be better to use zsmapbench by Seth.
> https://github.com/spartacus06/zsmapbench

Thanks for the pointer. I also found fio which can generate I/O with different
levels of compressibility. These would help with future evaluations.

> Frankly speaking, I don't expect it has a significant regression as you said.
> More concern to me is we are going to make tight coupling with zram/zcache +
> zsmalloc. It makes changing from zsmalloc to smarter allocator hard.

I could not understand how this patch increases the zram + zsmalloc coupling.
All we need from any potential allocator replacement is an interface to
provide the object size given its handle. If the new allocator cannot
support that, then we can create another size[] just like handle[] to
maintain sizes. So, it's really simple to replace zsmalloc, if required, at
least for zram.


> The reason I have a such concern is that I have a TODO which supports
> swap-over-zram pages migratin to storage swap when zram is full. Yes. It's a
> just plan, no schedule at the moment so I can't insist on but if i try it in
> future, I might want to replace zsmalloc to another. In case of that, tight
> coupling would be a another hurdle.

For me, it's hard to see what kind of issues you might face during this
zram-to-disk migration work, especially from the allocator side. Anyways, this
would be an interesting feature, so please let me know if you get any issues
I can help with.

> But as I mentioned, I shouldn't prevent your great work by my ideal plan which
> even don't have real schedule. So still I can follow your opinion.
> Please resend the patch with fix if you think it's really worthy. :)
>

I didn't get it. Do you want any changes in this patch? Or in patch 2/2?
In my opinion this patch should be included since you are getting get_size()
functionality for almost no cost.

Thanks,
Nitin


>>
>> >
>> >>
>> >>Signed-off-by: Nitin Gupta <[email protected]>
>> >>---
>> >> drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
>> >> drivers/staging/zsmalloc/zsmalloc.h | 1 +
>> >> 2 files changed, 127 insertions(+), 51 deletions(-)
>> >>
>> >>diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
>> >>index 09a9d35..65c9d3b 100644
>> >>--- a/drivers/staging/zsmalloc/zsmalloc-main.c
>> >>+++ b/drivers/staging/zsmalloc/zsmalloc-main.c
>> >>@@ -112,20 +112,20 @@
>> >> #define MAX_PHYSMEM_BITS 36
>> >> #else /* !CONFIG_HIGHMEM64G */
>> >> /*
>> >>- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
>> >>+ * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
>> >> * be PAGE_SHIFT
>> >> */
>> >> #define MAX_PHYSMEM_BITS BITS_PER_LONG
>> >> #endif
>> >> #endif
>> >> #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
>> >>-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
>> >>-#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
>> >>+#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
>> >>+#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)
>> >>
>> >> #define MAX(a, b) ((a) >= (b) ? (a) : (b))
>> >> /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
>> >> #define ZS_MIN_ALLOC_SIZE \
>> >>- MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
>> >>+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
>> >> #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
>> >>
>> >> /*
>> >>@@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
>> >> return PagePrivate2(page);
>> >> }
>> >>
>> >>+static unsigned long get_page_index(struct page *page)
>> >
>> >IMO, first_obj_offset(struct page *page) would be better readable.
>> >
>> >>+{
>> >>+ return is_first_page(page) ? 0 : page->index;
>> >>+}
>> >>+
>> >> static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
>> >> enum fullness_group *fullness)
>> >> {
>> >>@@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
>> >> return next;
>> >> }
>> >>
>> >>-/* Encode <page, obj_idx> as a single handle value */
>> >>-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
>> >>+static struct page *get_prev_page(struct page *page)
>> >> {
>> >>- unsigned long handle;
>> >>+ struct page *prev, *first_page;
>> >>
>> >>- if (!page) {
>> >>- BUG_ON(obj_idx);
>> >>- return NULL;
>> >>- }
>> >>+ first_page = get_first_page(page);
>> >>+ if (page == first_page)
>> >>+ prev = NULL;
>> >>+ else if (page == (struct page *)first_page->private)
>> >>+ prev = first_page;
>> >>+ else
>> >>+ prev = list_entry(page->lru.prev, struct page, lru);
>> >>
>> >>- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
>> >>- handle |= (obj_idx & OBJ_INDEX_MASK);
>> >>+ return prev;
>> >>
>> >>- return (void *)handle;
>> >> }
>> >>
>> >>-/* Decode <page, obj_idx> pair from the given object handle */
>> >>-static void obj_handle_to_location(unsigned long handle, struct page **page,
>> >>- unsigned long *obj_idx)
>> >>+static void *encode_ptr(struct page *page, unsigned long offset)
>> >> {
>> >>- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
>> >>- *obj_idx = handle & OBJ_INDEX_MASK;
>> >>+ unsigned long ptr;
>> >>+ ptr = page_to_pfn(page) << OFFSET_BITS;
>> >>+ ptr |= offset & OFFSET_MASK;
>> >>+ return (void *)ptr;
>> >>+}
>> >>+
>> >>+static void decode_ptr(unsigned long ptr, struct page **page,
>> >>+ unsigned int *offset)
>> >>+{
>> >>+ *page = pfn_to_page(ptr >> OFFSET_BITS);
>> >>+ *offset = ptr & OFFSET_MASK;
>> >>+}
>> >>+
>> >>+static struct page *obj_handle_to_page(unsigned long handle)
>> >>+{
>> >>+ struct page *page;
>> >>+ unsigned int offset;
>> >>+
>> >>+ decode_ptr(handle, &page, &offset);
>> >>+ if (offset < get_page_index(page))
>> >>+ page = get_prev_page(page);
>> >>+
>> >>+ return page;
>> >>+}
>> >>+
>> >>+static unsigned int obj_handle_to_offset(unsigned long handle,
>> >>+ unsigned int class_size)
>> >>+{
>> >>+ struct page *page;
>> >>+ unsigned int offset;
>> >>+
>> >>+ decode_ptr(handle, &page, &offset);
>> >>+ if (offset < get_page_index(page))
>> >>+ offset = PAGE_SIZE - class_size + get_page_index(page);
>> >
>> >Althoug it's trivial, we can reduce get_page_index calling.
>> >
>> >>+ else
>> >>+ offset = roundup(offset, class_size) - class_size;
>> >>+
>> >>+ return offset;
>> >> }
>> >>
>> >>-static unsigned long obj_idx_to_offset(struct page *page,
>> >>- unsigned long obj_idx, int class_size)
>> >>+/* Encode <page, offset, size> as a single handle value */
>> >
>> >Need more kind comment about encoding scheme with obj's end <page, offset>
>> >
>> >>+static void *obj_location_to_handle(struct page *page, unsigned int offset,
>> >>+ unsigned int size, unsigned int class_size)
>> >> {
>> >>- unsigned long off = 0;
>> >>+ struct page *endpage;
>> >>+ unsigned int endoffset;
>> >>
>> >>- if (!is_first_page(page))
>> >>- off = page->index;
>> >>+ if (!page) {
>> >>+ BUG_ON(offset);
>> >>+ return NULL;
>> >>+ }
>> >
>> >What do you expect to catch with above check?
>> >
>> >>+ BUG_ON(offset >= PAGE_SIZE);
>> >>+
>> >>+ endpage = page;
>> >>+ endoffset = offset + size - 1;
>> >>+ if (endoffset >= PAGE_SIZE) {
>> >>+ endpage = get_next_page(page);
>> >>+ BUG_ON(!endpage);
>> >>+ endoffset -= PAGE_SIZE;
>> >>+ }
>> >>
>> >>- return off + obj_idx * class_size;
>> >>+ return encode_ptr(endpage, endoffset);
>> >> }
>> >>
>> >> static void reset_page(struct page *page)
>> >>@@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
>> >> /* Initialize a newly allocated zspage */
>> >> static void init_zspage(struct page *first_page, struct size_class *class)
>> >> {
>> >>- unsigned long off = 0;
>> >>+ unsigned long off = 0, next_off = 0;
>> >> struct page *page = first_page;
>> >>
>> >> BUG_ON(!is_first_page(first_page));
>> >> while (page) {
>> >> struct page *next_page;
>> >> struct link_free *link;
>> >>- unsigned int i, objs_on_page;
>> >>
>> >> /*
>> >> * page->index stores offset of first object starting
>> >>@@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
>> >>
>> >> link = (struct link_free *)kmap_atomic(page) +
>> >> off / sizeof(*link);
>> >>- objs_on_page = (PAGE_SIZE - off) / class->size;
>> >>
>> >>- for (i = 1; i <= objs_on_page; i++) {
>> >>- off += class->size;
>> >>- if (off < PAGE_SIZE) {
>> >>- link->next = obj_location_to_handle(page, i);
>> >>- link += class->size / sizeof(*link);
>> >>- }
>> >>+ next_off = off + class->size;
>> >>+ while (next_off < PAGE_SIZE) {
>> >>+ link->next = encode_ptr(page, next_off);
>> >>+ link += class->size / sizeof(*link);
>> >>+ next_off += class->size;
>> >> }
>> >>
>> >> /*
>> >>@@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
>> >> * page (if present)
>> >> */
>> >> next_page = get_next_page(page);
>> >>- link->next = obj_location_to_handle(next_page, 0);
>> >>+ next_off = next_page ? next_off - PAGE_SIZE : 0;
>> >>+ link->next = encode_ptr(next_page, next_off);
>> >> kunmap_atomic(link);
>> >> page = next_page;
>> >>- off = (off + class->size) % PAGE_SIZE;
>> >>+ off = next_off;
>> >> }
>> >> }
>> >>
>> >>@@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
>> >>
>> >> init_zspage(first_page, class);
>> >>
>> >>- first_page->freelist = obj_location_to_handle(first_page, 0);
>> >>+ first_page->freelist = encode_ptr(first_page, 0);
>> >> /* Maximum number of objects we can store in this zspage */
>> >> first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
>> >>
>> >>@@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >> struct size_class *class;
>> >>
>> >> struct page *first_page, *m_page;
>> >>- unsigned long m_objidx, m_offset;
>> >>+ unsigned int m_offset;
>> >>
>> >> if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
>> >> return 0;
>> >>@@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >> }
>> >>
>> >> obj = (unsigned long)first_page->freelist;
>> >>- obj_handle_to_location(obj, &m_page, &m_objidx);
>> >>- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
>> >>+ decode_ptr(obj, &m_page, &m_offset);
>> >>
>> >> link = (struct link_free *)kmap_atomic(m_page) +
>> >> m_offset / sizeof(*link);
>> >>@@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
>> >> first_page->inuse++;
>> >> /* Now move the zspage to another fullness group, if required */
>> >> fix_fullness_group(pool, first_page);
>> >>+
>> >>+ obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
>> >>+ size, class->size);
>> >> spin_unlock(&class->lock);
>> >>
>> >> return obj;
>> >>@@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> >> {
>> >> struct link_free *link;
>> >> struct page *first_page, *f_page;
>> >>- unsigned long f_objidx, f_offset;
>> >>+ unsigned long f_offset;
>> >>
>> >> int class_idx;
>> >> struct size_class *class;
>> >>@@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> >> if (unlikely(!obj))
>> >> return;
>> >>
>> >>- obj_handle_to_location(obj, &f_page, &f_objidx);
>> >>+ f_page = obj_handle_to_page(obj);
>> >> first_page = get_first_page(f_page);
>> >>
>> >> get_zspage_mapping(first_page, &class_idx, &fullness);
>> >> class = &pool->size_class[class_idx];
>> >>- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
>> >>+ f_offset = obj_handle_to_offset(obj, class->size);
>> >>
>> >> spin_lock(&class->lock);
>> >>
>> >>@@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
>> >> + f_offset);
>> >> link->next = first_page->freelist;
>> >> kunmap_atomic(link);
>> >>- first_page->freelist = (void *)obj;
>> >>+ first_page->freelist = encode_ptr(f_page, f_offset);
>> >>
>> >> first_page->inuse--;
>> >> fullness = fix_fullness_group(pool, first_page);
>> >>@@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
>> >> * This function returns with preemption and page faults disabled.
>> >> */
>> >> void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> >>- enum zs_mapmode mm)
>> >>+ enum zs_mapmode mm)
>> >> {
>> >> struct page *page;
>> >>- unsigned long obj_idx, off;
>> >>+ unsigned long off;
>> >>
>> >> unsigned int class_idx;
>> >> enum fullness_group fg;
>> >>@@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> >> */
>> >> BUG_ON(in_interrupt());
>> >>
>> >>- obj_handle_to_location(handle, &page, &obj_idx);
>> >>+ page = obj_handle_to_page(handle);
>> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
>> >> class = &pool->size_class[class_idx];
>> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
>> >>+ off = obj_handle_to_offset(handle, class->size);
>> >>
>> >> area = &get_cpu_var(zs_map_area);
>> >> area->vm_mm = mm;
>> >>@@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
>> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>> >> {
>> >> struct page *page;
>> >>- unsigned long obj_idx, off;
>> >>+ unsigned long off;
>> >>
>> >> unsigned int class_idx;
>> >> enum fullness_group fg;
>> >>@@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>> >>
>> >> BUG_ON(!handle);
>> >>
>> >>- obj_handle_to_location(handle, &page, &obj_idx);
>> >>+ page = obj_handle_to_page(handle);
>> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
>> >> class = &pool->size_class[class_idx];
>> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
>> >>+ off = obj_handle_to_offset(handle, class->size);
>> >>
>> >> area = &__get_cpu_var(zs_map_area);
>> >> if (off + class->size <= PAGE_SIZE)
>> >>@@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>> >> }
>> >> EXPORT_SYMBOL_GPL(zs_unmap_object);
>> >>
>> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
>> >>+{
>> >>+ struct page *endpage;
>> >>+ unsigned int endoffset, size;
>> >>+
>> >>+ unsigned int class_idx;
>> >>+ enum fullness_group fg;
>> >>+ struct size_class *class;
>> >>+
>> >>+ decode_ptr(handle, &endpage, &endoffset);
>> >>+ get_zspage_mapping(endpage, &class_idx, &fg);
>> >>+ class = &pool->size_class[class_idx];
>> >>+
>> >>+ size = endoffset + 1;
>> >>+ if (endoffset < get_page_index(endpage))
>> >>+ size += class->size - get_page_index(endpage);
>> >>+ else
>> >>+ size -= rounddown(endoffset, class->size);
>> >>+
>> >>+ return size;
>> >>+}
>> >>+EXPORT_SYMBOL_GPL(zs_get_object_size);
>> >>+
>> >> u64 zs_get_total_size_bytes(struct zs_pool *pool)
>> >> {
>> >> int i;
>> >>diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
>> >>index de2e8bf..2830fdf 100644
>> >>--- a/drivers/staging/zsmalloc/zsmalloc.h
>> >>+++ b/drivers/staging/zsmalloc/zsmalloc.h
>> >>@@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>> >> enum zs_mapmode mm);
>> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
>> >>
>> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
>> >> u64 zs_get_total_size_bytes(struct zs_pool *pool);
>> >>
>> >> #endif
>> >>--
>> >>1.7.10.4
>> >>
>> >>--
>> >>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> >>the body of a message to [email protected]
>> >>More majordomo info at http://vger.kernel.org/majordomo-info.html
>> >>Please read the FAQ at http://www.tux.org/lkml/
>> >
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>
> --
> Kind regards,
> Minchan Kim

2012-11-30 01:58:15

by Minchan Kim

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On Thu, Nov 29, 2012 at 04:53:15PM -0800, Nitin Gupta wrote:
> On Thu, Nov 29, 2012 at 4:03 PM, Minchan Kim <[email protected]> wrote:
> > On Thu, Nov 29, 2012 at 01:09:56AM -0800, Nitin Gupta wrote:
> >> On 11/28/2012 11:45 PM, Minchan Kim wrote:
> >> >On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
> >> >>Adds zs_get_object_size(handle) which provides the size of
> >> >>the given object. This is useful since the user (zram etc.)
> >> >>now do not have to maintain object sizes separately, saving
> >> >>on some metadata size (4b per page).
> >> >>
> >> >>The object handle encodes <page, offset> pair which currently points
> >> >
> >> >Nitpick.
> >> >
> >> ><page, index> in descrption would be proper instead of
> >> ><page, offset>. You are going to replace <page, idx> with <page, offset>.
> >> >
> >>
> >> I think 'offset' conveys the meaning more clearly; 'index' is after
> >> all just a chopped-off version of offset :)
> >
> > In my perceptoin, offset means location from some point by some byte
> > while index is thing we have to multiply sizeof(object) to get.
> > So you used index before the patch but now you try to use offset instead of
> > index.
> >
> > Anyway, it's minor nitpick. Never mind if you don't agree. :)
> >
> >>
> >>
> >> >>to the start of the object. Now, the handle implicitly stores the size
> >> >>information by pointing to the object's end instead. Since zsmalloc is
> >> >>a slab based allocator, the start of the object can be easily determined
> >> >>and the difference between the end offset encoded in the handle and the
> >> >>start gives us the object size.
> >> >
> >> >It's a good idea. Look at just minor comment below.
> >> >
> >> >Let's talk with another concern. This patch surely helps current
> >> >customer's memory usage who should add 4 byte for accounting the
> >> >statistics while zsmalloc could be slow down.
> >> >Is it really valuable?
> >> >
> >> >Yes. zram/zcache had a tight coupling with zsmalloc so it already
> >> >lost modularity. :( In this POV, this patch makes sense.
> >> >But if someone are willing to remove statistics for performance?
> >> >Although he remove it, zsmalloc is still slow down.
> >> >
> >> >Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
> >> >and it accelerates dependency with customer so it makes changing allocator
> >> >hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
> >> >it's not good that worry future things too early without any plan.
> >> >So I'm not strongly against you. If no reviewer raises an eyebrow,
> >> >I will rely on your decision.
> >> >
> >>
> >> Looking over the changes I do not expect any difference in
> >> performance -- just a bit more arithmetic, however the use of
> >> get_prev_page() which may dereference a few extra pointers might not
> >> be really free. Also, my iozone test[1] shows very little difference
> >> in performance:
> >
> > Iozone test isn't enough to prove the minor slow down because it would have
> > many noise about I/O path and different compression ratio per I/O.
> >
> >>
> >> With just the fix for crash (patch 1/2):
> >> 9.28user 1159.84system 21:46.54elapsed 89%CPU (0avgtext+0avgdata
> >> 50200maxresident)k
> >> 212988544inputs+190660816outputs (1major+16706minor)pagefaults 0swaps
> >>
> >> With the new get_object_size() code (patch 1/2 + patch 2/2):
> >> 9.20user 1112.63system 21:03.61elapsed 88%CPU (0avgtext+0avgdata
> >> 50200maxresident)k
> >> 194636568inputs+190500424outputs (1major+16707minor)pagefaults 0swaps
> >>
> >> I really cannot explain this ~40s speedup but anyways, I think
> >> optimizing zsmalloc/zram should be taken up separately, at least
> >> when this new code does not seem to have any significant effects.
> >>
> >>
> >> [1] iozone test:
> >> - Create zram of size 1200m
> >> - Create ext4 fs
> >> - iozone -a -g 1G
> >> - In parallel: watch zram_stress
> >>
> >> # zram_stress
> >> sync
> >> echo 1 | sudo tee /proc/sys/vm/drop_caches
> >>
> >>
> >> Surely, not the most accurate of the tests but gives an idea if
> >> anything is making a significant difference.
> >
> > For more accurate test, it would be better to use zsmapbench by Seth.
> > https://github.com/spartacus06/zsmapbench
>
> Thanks for the pointer. I also found fio which can generate I/O with different
> levels of compressibility. These would help with future evaluations.
>
> > Frankly speaking, I don't expect it has a significant regression as you said.
> > More concern to me is we are going to make tight coupling with zram/zcache +
> > zsmalloc. It makes changing from zsmalloc to smarter allocator hard.
>
> I could not understand how this patch increasing zram + zsmalloc coupling.
> All we need from any potential allocator replacement is an interface to
> provide the object size given its handle. If the new allocation cannot

That's it. Why should a general allocator support such a special function?

> support that then the we can create another size[] just like handle[] to
> maintain sizes. So, its really simple to replace zsmalloc. if required, at
> least for the zram.

You're right. It's simple now, but the concern is that we are moving toward
putting a function that should live on the customer side into the general
allocator. Although it's simple at the moment, it could become severe if such
special functionalities are accumulated.

>
>
> > The reason I have such a concern is that I have a TODO which supports
> > swap-over-zram pages migrating to storage swap when zram is full. Yes, it's
> > just a plan with no schedule at the moment, so I can't insist on it, but if I
> > try it in the future, I might want to replace zsmalloc with another allocator.
> > In that case, tight coupling would be another hurdle.
>
> For me, its hard to see what kind of issues you might face during this
> zram-to-disk migration work, especially from the allocator side. Anyways, this

The reason is that I want to make free pages via such migration when memory pressure
is very severe. For that, we need some scheme to select victim zram objects, like an LRU.
In that case, although we migrate them, we can't get a free page if they are
spread over lots of other zram pages. It's already a problem in slab-based allocators.

If I should develop an allocator to address the above concern, I would never want to
provide a get-object-size function from the new allocator.


> would be an interesting feature, so please let me know if you get any issues
> I can help with.
>
> > But as I mentioned, I shouldn't prevent your great work by my ideal plan which
> > even don't have real schedule. So still I can follow your opinion.
> > Please resend the patch with fix if you think it's really worthy. :)
> >
>
> I didn't get it. Do you want any changes in this patch? or in patch 2/2?
> In my opinion this patch should be included since you are getting get_size()
> functionality for almost no cost.

Hmm, you mentioned the text below, so I guess you found some bug. No?
With just the fix for crash (patch 1/2):

>
> Thanks,
> Nitin
>
>
> >>
> >> >
> >> >>
> >> >>Signed-off-by: Nitin Gupta <[email protected]>
> >> >>---
> >> >> drivers/staging/zsmalloc/zsmalloc-main.c | 177 +++++++++++++++++++++---------
> >> >> drivers/staging/zsmalloc/zsmalloc.h | 1 +
> >> >> 2 files changed, 127 insertions(+), 51 deletions(-)
> >> >>
> >> >>diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
> >> >>index 09a9d35..65c9d3b 100644
> >> >>--- a/drivers/staging/zsmalloc/zsmalloc-main.c
> >> >>+++ b/drivers/staging/zsmalloc/zsmalloc-main.c
> >> >>@@ -112,20 +112,20 @@
> >> >> #define MAX_PHYSMEM_BITS 36
> >> >> #else /* !CONFIG_HIGHMEM64G */
> >> >> /*
> >> >>- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
> >> >>+ * If this definition of MAX_PHYSMEM_BITS is used, OFFSET_BITS will just
> >> >> * be PAGE_SHIFT
> >> >> */
> >> >> #define MAX_PHYSMEM_BITS BITS_PER_LONG
> >> >> #endif
> >> >> #endif
> >> >> #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
> >> >>-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
> >> >>-#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
> >> >>+#define OFFSET_BITS (BITS_PER_LONG - _PFN_BITS)
> >> >>+#define OFFSET_MASK ((_AC(1, UL) << OFFSET_BITS) - 1)
> >> >>
> >> >> #define MAX(a, b) ((a) >= (b) ? (a) : (b))
> >> >> /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
> >> >> #define ZS_MIN_ALLOC_SIZE \
> >> >>- MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
> >> >>+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OFFSET_BITS))
> >> >> #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
> >> >>
> >> >> /*
> >> >>@@ -256,6 +256,11 @@ static int is_last_page(struct page *page)
> >> >> return PagePrivate2(page);
> >> >> }
> >> >>
> >> >>+static unsigned long get_page_index(struct page *page)
> >> >
> >> >IMO, first_obj_offset(struct page *page) would be better readable.
> >> >
> >> >>+{
> >> >>+ return is_first_page(page) ? 0 : page->index;
> >> >>+}
> >> >>+
> >> >> static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
> >> >> enum fullness_group *fullness)
> >> >> {
> >> >>@@ -433,39 +438,86 @@ static struct page *get_next_page(struct page *page)
> >> >> return next;
> >> >> }
> >> >>
> >> >>-/* Encode <page, obj_idx> as a single handle value */
> >> >>-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
> >> >>+static struct page *get_prev_page(struct page *page)
> >> >> {
> >> >>- unsigned long handle;
> >> >>+ struct page *prev, *first_page;
> >> >>
> >> >>- if (!page) {
> >> >>- BUG_ON(obj_idx);
> >> >>- return NULL;
> >> >>- }
> >> >>+ first_page = get_first_page(page);
> >> >>+ if (page == first_page)
> >> >>+ prev = NULL;
> >> >>+ else if (page == (struct page *)first_page->private)
> >> >>+ prev = first_page;
> >> >>+ else
> >> >>+ prev = list_entry(page->lru.prev, struct page, lru);
> >> >>
> >> >>- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
> >> >>- handle |= (obj_idx & OBJ_INDEX_MASK);
> >> >>+ return prev;
> >> >>
> >> >>- return (void *)handle;
> >> >> }
> >> >>
> >> >>-/* Decode <page, obj_idx> pair from the given object handle */
> >> >>-static void obj_handle_to_location(unsigned long handle, struct page **page,
> >> >>- unsigned long *obj_idx)
> >> >>+static void *encode_ptr(struct page *page, unsigned long offset)
> >> >> {
> >> >>- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
> >> >>- *obj_idx = handle & OBJ_INDEX_MASK;
> >> >>+ unsigned long ptr;
> >> >>+ ptr = page_to_pfn(page) << OFFSET_BITS;
> >> >>+ ptr |= offset & OFFSET_MASK;
> >> >>+ return (void *)ptr;
> >> >>+}
> >> >>+
> >> >>+static void decode_ptr(unsigned long ptr, struct page **page,
> >> >>+ unsigned int *offset)
> >> >>+{
> >> >>+ *page = pfn_to_page(ptr >> OFFSET_BITS);
> >> >>+ *offset = ptr & OFFSET_MASK;
> >> >>+}
> >> >>+
> >> >>+static struct page *obj_handle_to_page(unsigned long handle)
> >> >>+{
> >> >>+ struct page *page;
> >> >>+ unsigned int offset;
> >> >>+
> >> >>+ decode_ptr(handle, &page, &offset);
> >> >>+ if (offset < get_page_index(page))
> >> >>+ page = get_prev_page(page);
> >> >>+
> >> >>+ return page;
> >> >>+}
> >> >>+
> >> >>+static unsigned int obj_handle_to_offset(unsigned long handle,
> >> >>+ unsigned int class_size)
> >> >>+{
> >> >>+ struct page *page;
> >> >>+ unsigned int offset;
> >> >>+
> >> >>+ decode_ptr(handle, &page, &offset);
> >> >>+ if (offset < get_page_index(page))
> >> >>+ offset = PAGE_SIZE - class_size + get_page_index(page);
> >> >
> >> >Althoug it's trivial, we can reduce get_page_index calling.
> >> >
> >> >>+ else
> >> >>+ offset = roundup(offset, class_size) - class_size;
> >> >>+
> >> >>+ return offset;
> >> >> }
> >> >>
> >> >>-static unsigned long obj_idx_to_offset(struct page *page,
> >> >>- unsigned long obj_idx, int class_size)
> >> >>+/* Encode <page, offset, size> as a single handle value */
> >> >
> >> >Need more kind comment about encoding scheme with obj's end <page, offset>
> >> >
> >> >>+static void *obj_location_to_handle(struct page *page, unsigned int offset,
> >> >>+ unsigned int size, unsigned int class_size)
> >> >> {
> >> >>- unsigned long off = 0;
> >> >>+ struct page *endpage;
> >> >>+ unsigned int endoffset;
> >> >>
> >> >>- if (!is_first_page(page))
> >> >>- off = page->index;
> >> >>+ if (!page) {
> >> >>+ BUG_ON(offset);
> >> >>+ return NULL;
> >> >>+ }
> >> >
> >> >What do you expect to catch with above check?
> >> >
> >> >>+ BUG_ON(offset >= PAGE_SIZE);
> >> >>+
> >> >>+ endpage = page;
> >> >>+ endoffset = offset + size - 1;
> >> >>+ if (endoffset >= PAGE_SIZE) {
> >> >>+ endpage = get_next_page(page);
> >> >>+ BUG_ON(!endpage);
> >> >>+ endoffset -= PAGE_SIZE;
> >> >>+ }
> >> >>
> >> >>- return off + obj_idx * class_size;
> >> >>+ return encode_ptr(endpage, endoffset);
> >> >> }
> >> >>
> >> >> static void reset_page(struct page *page)
> >> >>@@ -506,14 +558,13 @@ static void free_zspage(struct page *first_page)
> >> >> /* Initialize a newly allocated zspage */
> >> >> static void init_zspage(struct page *first_page, struct size_class *class)
> >> >> {
> >> >>- unsigned long off = 0;
> >> >>+ unsigned long off = 0, next_off = 0;
> >> >> struct page *page = first_page;
> >> >>
> >> >> BUG_ON(!is_first_page(first_page));
> >> >> while (page) {
> >> >> struct page *next_page;
> >> >> struct link_free *link;
> >> >>- unsigned int i, objs_on_page;
> >> >>
> >> >> /*
> >> >> * page->index stores offset of first object starting
> >> >>@@ -526,14 +577,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
> >> >>
> >> >> link = (struct link_free *)kmap_atomic(page) +
> >> >> off / sizeof(*link);
> >> >>- objs_on_page = (PAGE_SIZE - off) / class->size;
> >> >>
> >> >>- for (i = 1; i <= objs_on_page; i++) {
> >> >>- off += class->size;
> >> >>- if (off < PAGE_SIZE) {
> >> >>- link->next = obj_location_to_handle(page, i);
> >> >>- link += class->size / sizeof(*link);
> >> >>- }
> >> >>+ next_off = off + class->size;
> >> >>+ while (next_off < PAGE_SIZE) {
> >> >>+ link->next = encode_ptr(page, next_off);
> >> >>+ link += class->size / sizeof(*link);
> >> >>+ next_off += class->size;
> >> >> }
> >> >>
> >> >> /*
> >> >>@@ -542,10 +591,11 @@ static void init_zspage(struct page *first_page, struct size_class *class)
> >> >> * page (if present)
> >> >> */
> >> >> next_page = get_next_page(page);
> >> >>- link->next = obj_location_to_handle(next_page, 0);
> >> >>+ next_off = next_page ? next_off - PAGE_SIZE : 0;
> >> >>+ link->next = encode_ptr(next_page, next_off);
> >> >> kunmap_atomic(link);
> >> >> page = next_page;
> >> >>- off = (off + class->size) % PAGE_SIZE;
> >> >>+ off = next_off;
> >> >> }
> >> >> }
> >> >>
> >> >>@@ -596,7 +646,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
> >> >>
> >> >> init_zspage(first_page, class);
> >> >>
> >> >>- first_page->freelist = obj_location_to_handle(first_page, 0);
> >> >>+ first_page->freelist = encode_ptr(first_page, 0);
> >> >> /* Maximum number of objects we can store in this zspage */
> >> >> first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
> >> >>
> >> >>@@ -871,7 +921,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >> struct size_class *class;
> >> >>
> >> >> struct page *first_page, *m_page;
> >> >>- unsigned long m_objidx, m_offset;
> >> >>+ unsigned int m_offset;
> >> >>
> >> >> if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
> >> >> return 0;
> >> >>@@ -895,8 +945,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >> }
> >> >>
> >> >> obj = (unsigned long)first_page->freelist;
> >> >>- obj_handle_to_location(obj, &m_page, &m_objidx);
> >> >>- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
> >> >>+ decode_ptr(obj, &m_page, &m_offset);
> >> >>
> >> >> link = (struct link_free *)kmap_atomic(m_page) +
> >> >> m_offset / sizeof(*link);
> >> >>@@ -907,6 +956,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
> >> >> first_page->inuse++;
> >> >> /* Now move the zspage to another fullness group, if required */
> >> >> fix_fullness_group(pool, first_page);
> >> >>+
> >> >>+ obj = (unsigned long)obj_location_to_handle(m_page, m_offset,
> >> >>+ size, class->size);
> >> >> spin_unlock(&class->lock);
> >> >>
> >> >> return obj;
> >> >>@@ -917,7 +969,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> >> {
> >> >> struct link_free *link;
> >> >> struct page *first_page, *f_page;
> >> >>- unsigned long f_objidx, f_offset;
> >> >>+ unsigned long f_offset;
> >> >>
> >> >> int class_idx;
> >> >> struct size_class *class;
> >> >>@@ -926,12 +978,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> >> if (unlikely(!obj))
> >> >> return;
> >> >>
> >> >>- obj_handle_to_location(obj, &f_page, &f_objidx);
> >> >>+ f_page = obj_handle_to_page(obj);
> >> >> first_page = get_first_page(f_page);
> >> >>
> >> >> get_zspage_mapping(first_page, &class_idx, &fullness);
> >> >> class = &pool->size_class[class_idx];
> >> >>- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
> >> >>+ f_offset = obj_handle_to_offset(obj, class->size);
> >> >>
> >> >> spin_lock(&class->lock);
> >> >>
> >> >>@@ -940,7 +992,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
> >> >> + f_offset);
> >> >> link->next = first_page->freelist;
> >> >> kunmap_atomic(link);
> >> >>- first_page->freelist = (void *)obj;
> >> >>+ first_page->freelist = encode_ptr(f_page, f_offset);
> >> >>
> >> >> first_page->inuse--;
> >> >> fullness = fix_fullness_group(pool, first_page);
> >> >>@@ -970,10 +1022,10 @@ EXPORT_SYMBOL_GPL(zs_free);
> >> >> * This function returns with preemption and page faults disabled.
> >> >> */
> >> >> void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >> >>- enum zs_mapmode mm)
> >> >>+ enum zs_mapmode mm)
> >> >> {
> >> >> struct page *page;
> >> >>- unsigned long obj_idx, off;
> >> >>+ unsigned long off;
> >> >>
> >> >> unsigned int class_idx;
> >> >> enum fullness_group fg;
> >> >>@@ -990,10 +1042,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >> >> */
> >> >> BUG_ON(in_interrupt());
> >> >>
> >> >>- obj_handle_to_location(handle, &page, &obj_idx);
> >> >>+ page = obj_handle_to_page(handle);
> >> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> >> >> class = &pool->size_class[class_idx];
> >> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
> >> >>+ off = obj_handle_to_offset(handle, class->size);
> >> >>
> >> >> area = &get_cpu_var(zs_map_area);
> >> >> area->vm_mm = mm;
> >> >>@@ -1015,7 +1067,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
> >> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >> >> {
> >> >> struct page *page;
> >> >>- unsigned long obj_idx, off;
> >> >>+ unsigned long off;
> >> >>
> >> >> unsigned int class_idx;
> >> >> enum fullness_group fg;
> >> >>@@ -1024,10 +1076,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >> >>
> >> >> BUG_ON(!handle);
> >> >>
> >> >>- obj_handle_to_location(handle, &page, &obj_idx);
> >> >>+ page = obj_handle_to_page(handle);
> >> >> get_zspage_mapping(get_first_page(page), &class_idx, &fg);
> >> >> class = &pool->size_class[class_idx];
> >> >>- off = obj_idx_to_offset(page, obj_idx, class->size);
> >> >>+ off = obj_handle_to_offset(handle, class->size);
> >> >>
> >> >> area = &__get_cpu_var(zs_map_area);
> >> >> if (off + class->size <= PAGE_SIZE)
> >> >>@@ -1045,6 +1097,29 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
> >> >> }
> >> >> EXPORT_SYMBOL_GPL(zs_unmap_object);
> >> >>
> >> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle)
> >> >>+{
> >> >>+ struct page *endpage;
> >> >>+ unsigned int endoffset, size;
> >> >>+
> >> >>+ unsigned int class_idx;
> >> >>+ enum fullness_group fg;
> >> >>+ struct size_class *class;
> >> >>+
> >> >>+ decode_ptr(handle, &endpage, &endoffset);
> >> >>+ get_zspage_mapping(endpage, &class_idx, &fg);
> >> >>+ class = &pool->size_class[class_idx];
> >> >>+
> >> >>+ size = endoffset + 1;
> >> >>+ if (endoffset < get_page_index(endpage))
> >> >>+ size += class->size - get_page_index(endpage);
> >> >>+ else
> >> >>+ size -= rounddown(endoffset, class->size);
> >> >>+
> >> >>+ return size;
> >> >>+}
> >> >>+EXPORT_SYMBOL_GPL(zs_get_object_size);
> >> >>+
> >> >> u64 zs_get_total_size_bytes(struct zs_pool *pool)
> >> >> {
> >> >> int i;
> >> >>diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
> >> >>index de2e8bf..2830fdf 100644
> >> >>--- a/drivers/staging/zsmalloc/zsmalloc.h
> >> >>+++ b/drivers/staging/zsmalloc/zsmalloc.h
> >> >>@@ -38,6 +38,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >> >> enum zs_mapmode mm);
> >> >> void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
> >> >>
> >> >>+size_t zs_get_object_size(struct zs_pool *pool, unsigned long handle);
> >> >> u64 zs_get_total_size_bytes(struct zs_pool *pool);
> >> >>
> >> >> #endif
> >> >>--
> >> >>1.7.10.4
> >> >>
> >> >>--
> >> >>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> >>the body of a message to [email protected]
> >> >>More majordomo info at http://vger.kernel.org/majordomo-info.html
> >> >>Please read the FAQ at http://www.tux.org/lkml/
> >> >
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to [email protected]
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at http://www.tux.org/lkml/
> >
> > --
> > Kind regards,
> > Minchan Kim
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Kind regards,
Minchan Kim

2012-11-30 02:29:06

by Nitin Gupta

[permalink] [raw]
Subject: Re: [PATCH 1/2] zsmalloc: add function to query object size

On Thu, Nov 29, 2012 at 5:58 PM, Minchan Kim <[email protected]> wrote:
> On Thu, Nov 29, 2012 at 04:53:15PM -0800, Nitin Gupta wrote:
>> On Thu, Nov 29, 2012 at 4:03 PM, Minchan Kim <[email protected]> wrote:
>> > On Thu, Nov 29, 2012 at 01:09:56AM -0800, Nitin Gupta wrote:
>> >> On 11/28/2012 11:45 PM, Minchan Kim wrote:
>> >> >On Mon, Nov 26, 2012 at 11:26:40PM -0800, Nitin Gupta wrote:
>> >> >>Adds zs_get_object_size(handle) which provides the size of
>> >> >>the given object. This is useful since the user (zram etc.)
>> >> >>now do not have to maintain object sizes separately, saving
>> >> >>on some metadata size (4b per page).
>> >> >>
>> >> >>The object handle encodes <page, offset> pair which currently points
>> >> >
>> >> >Nitpick.
>> >> >
>> >> ><page, index> in the description would be proper instead of
>> >> ><page, offset>. You are going to replace <page, idx> with <page, offset>.
>> >> >
>> >>
>> >> I think 'offset' conveys the meaning more clearly; 'index' is after
>> >> all just a chopped-off version of offset :)
>> >
>> > In my perception, offset means a location from some point in bytes,
>> > while index is a thing we have to multiply by sizeof(object) to get.
>> > So you used index before the patch but now you try to use offset instead of
>> > index.
>> >
>> > Anyway, it's minor nitpick. Never mind if you don't agree. :)
>> >
>> >>
>> >>
>> >> >>to the start of the object. Now, the handle implicitly stores the size
>> >> >>information by pointing to the object's end instead. Since zsmalloc is
>> >> >>a slab based allocator, the start of the object can be easily determined
>> >> >>and the difference between the end offset encoded in the handle and the
>> >> >>start gives us the object size.
>> >> >
>> >> >It's a good idea. Look at just minor comment below.
>> >> >
>> >> >Let's talk with another concern. This patch surely helps current
>> >> >customer's memory usage who should add 4 byte for accounting the
>> >> >statistics while zsmalloc could be slow down.
>> >> >Is it really valuable?
>> >> >
>> >> >Yes. zram/zcache had a tight coupling with zsmalloc so it already
>> >> >lost modularity. :( In this POV, this patch makes sense.
>> >> >But if someone are willing to remove statistics for performance?
>> >> >Although he remove it, zsmalloc is still slow down.
>> >> >
>> >> >Statistic for user of zsmalloc should be cost of user himself, not zsmalloc
>> >> >and it accelerates dependency with customer so it makes changing allocator
>> >> >hard in future. We already had such experience(xvmalloc->zsmalloc). Of course,
>> >> >it's not good that worry future things too early without any plan.
>> >> >So I'm not strongly against you. If no reviewer raises an eyebrow,
>> >> >I will rely on your decision.
>> >> >
>> >>
>> >> Looking over the changes I do not expect any difference in
>> >> performance -- just a bit more arithmetic, however the use of
>> >> get_prev_page() which may dereference a few extra pointers might not
>> >> be really free. Also, my iozone test[1] shows very little difference
>> >> in performance:
>> >
>> > Iozone test isn't enough to prove the minor slow down because it would have
>> > many noise about I/O path and different compression ratio per I/O.
>> >
>> >>
>> >> With just the fix for crash (patch 1/2):
>> >> 9.28user 1159.84system 21:46.54elapsed 89%CPU (0avgtext+0avgdata
>> >> 50200maxresident)k
>> >> 212988544inputs+190660816outputs (1major+16706minor)pagefaults 0swaps
>> >>
>> >> With the new get_object_size() code (patch 1/2 + patch 2/2):
>> >> 9.20user 1112.63system 21:03.61elapsed 88%CPU (0avgtext+0avgdata
>> >> 50200maxresident)k
>> >> 194636568inputs+190500424outputs (1major+16707minor)pagefaults 0swaps
>> >>
>> >> I really cannot explain this ~40s speedup but anyways, I think
>> >> optimizing zsmalloc/zram should be taken up separately, at least
>> >> when this new code does not seem to have any significant effects.
>> >>
>> >>
>> >> [1] iozone test:
>> >> - Create zram of size 1200m
>> >> - Create ext4 fs
>> >> - iozone -a -g 1G
>> >> - In parallel: watch zram_stress
>> >>
>> >> # zram_stress
>> >> sync
>> >> echo 1 | sudo tee /proc/sys/vm/drop_caches
>> >>
>> >>
>> >> Surely, not the most accurate of the tests but gives an idea if
>> >> anything is making a significant difference.
>> >
>> > For more accurate test, it would be better to use zsmapbench by Seth.
>> > https://github.com/spartacus06/zsmapbench
>>
>> Thanks for the pointer. I also found fio which can generate I/O with different
>> levels of compressibility. These would help with future evaluations.
>>
>> > Frankly speaking, I don't expect it has a significant regression as you said.
>> > More concern to me is we are going to make tight coupling with zram/zcache +
>> > zsmalloc. It makes changing from zsmalloc to smarter allocator hard.
>>
>> I could not understand how this patch increasing zram + zsmalloc coupling.
>> All we need from any potential allocator replacement is an interface to
>> provide the object size given its handle. If the new allocation cannot
>
> That's it. Why should a general allocator support such a special function?
>
>> support that then the we can create another size[] just like handle[] to
>> maintain sizes. So, its really simple to replace zsmalloc. if required, at
>> least for the zram.
>
> You're right. It's simple now, but the concern is that we are moving toward
> putting a function that should live on the customer side into the general
> allocator. Although it's simple at the moment, it could become severe if such
> special functionalities are accumulated.
>
>>
>>
>> > The reason I have such a concern is that I have a TODO which supports
>> > swap-over-zram pages migrating to storage swap when zram is full. Yes, it's
>> > just a plan with no schedule at the moment, so I can't insist on it, but if I
>> > try it in the future, I might want to replace zsmalloc with another allocator.
>> > In that case, tight coupling would be another hurdle.
>>
>> For me, its hard to see what kind of issues you might face during this
>> zram-to-disk migration work, especially from the allocator side. Anyways, this
>
> The reason is that I want to make free pages via such migration when memory pressure
> is very severe. For that, we need some scheme to select victim zram objects, like an LRU.
> In that case, although we migrate them, we can't get a free page if they are
> spread over lots of other zram pages. It's already a problem in slab-based allocators.
>
> If I should develop an allocator to address the above concern, I would never want to
> provide a get-object-size function from the new allocator.
>
>
>> would be an interesting feature, so please let me know if you get any issues
>> I can help with.
>>
>> > But as I mentioned, I shouldn't prevent your great work by my ideal plan which
>> > even don't have real schedule. So still I can follow your opinion.
>> > Please resend the patch with fix if you think it's really worthy. :)
>> >
>>
>> I didn't get it. Do you want any changes in this patch? or in patch 2/2?
>> In my opinion this patch should be included since you are getting get_size()
>> functionality for almost no cost.
>
> Hmm, you mentioned the text below, so I guess you found some bug. No?
> With just the fix for crash (patch 1/2):
>

I was just wondering why you wanted a resend of these patches if there
were no issues with the changelog message or the code. Anyway, since
Greg wants a resend, I will do it soon.

Thanks,
Nitin