2014-12-05 00:49:55

by Jaegeuk Kim

Subject: [PATCH 1/3] f2fs: fix missing kmem_cache_free

This patch fixes a missing kmem_cache_free when handling errors.
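
Without the free, each failed radix_tree_insert leaks the nat_entry_set that
was just allocated, since the retry path allocates a fresh one. For context,
the surrounding code looks roughly like this (reconstructed from the hunk;
the real function may differ in detail):

retry:
	head = radix_tree_lookup(&nm_i->nat_set_root, set);
	if (!head) {
		head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);

		INIT_LIST_HEAD(&head->entry_list);
		INIT_LIST_HEAD(&head->set_list);
		head->set = set;
		head->entry_cnt = 0;

		if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
			/* 'head' must be freed here before retrying */
			goto retry;
		}
	}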

Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/node.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b1466cf..c59341d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -158,7 +158,7 @@ retry:
head->entry_cnt = 0;

if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
- cond_resched();
+ kmem_cache_free(nat_entry_set_slab, head);
goto retry;
}
}
--
2.1.1


2014-12-05 00:50:05

by Jaegeuk Kim

Subject: [PATCH 2/3] f2fs: use rw_semaphore for nat entry lock

Previously, we used an rwlock for the nat_entry lock.
But now we have a lot of complex operations in set_node_addr
(e.g., allocating kernel memory, handling radix trees, and so on).

So, this patch changes the rwlock to an rw_semaphore so that the CPU can be
yielded to other threads while those operations run.
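
To illustrate the difference (a minimal sketch, not taken from the patch;
error handling omitted):

	/* rwlock is a spinning lock: holders must not sleep, so any
	 * allocation under it has to be atomic and may fail. */
	write_lock(&nm_i->nat_tree_lock);
	e = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
	write_unlock(&nm_i->nat_tree_lock);

	/* rw_semaphore holders may sleep, so blocking allocations and
	 * other sleeping operations become legal inside the section. */
	down_write(&nm_i->nat_tree_lock);
	e = kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
	up_write(&nm_i->nat_tree_lock);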

Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/f2fs.h | 2 +-
fs/f2fs/node.c | 52 ++++++++++++++++++++++++++--------------------------
2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d042813..c873140 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -332,7 +332,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- rwlock_t nat_tree_lock; /* protect nat_tree_lock */
+ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index c59341d..b47555f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -196,11 +196,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;

- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return is_cp;
}

@@ -210,11 +210,11 @@ bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool fsynced = false;

- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
fsynced = true;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return fsynced;
}

@@ -224,13 +224,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;

- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return need_update;
}

@@ -258,17 +258,17 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
{
struct nat_entry *e;
retry:
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
goto retry;
}
node_info_from_raw_nat(&e->ni, ne);
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}

static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -277,12 +277,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
retry:
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
if (!e) {
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
goto retry;
}
e->ni = *ni;
@@ -326,7 +326,7 @@ retry:
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}

int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -336,7 +336,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
if (available_free_memory(sbi, NAT_ENTRIES))
return 0;

- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
ne = list_first_entry(&nm_i->nat_entries,
@@ -344,7 +344,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
return nr_shrink;
}

@@ -367,14 +367,14 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->nid = nid;

/* Check nat cache */
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
}
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
if (e)
return;

@@ -1432,13 +1432,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)

if (build) {
/* do not add allocated nids */
- read_lock(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne &&
(!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
- read_unlock(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
if (allocated)
return 0;
}
@@ -1827,20 +1827,20 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)

raw_ne = nat_in_journal(sum, i);
retry:
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (ne)
goto found;

ne = grab_nat_entry(nm_i, nid);
if (!ne) {
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
goto retry;
}
node_info_from_raw_nat(&ne->ni, &raw_ne);
found:
__set_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
update_nats_in_cursum(sum, -i);
mutex_unlock(&curseg->curseg_mutex);
@@ -1911,10 +1911,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
raw_nat_from_node_info(raw_ne, &ne->ni);

- write_lock(&NM_I(sbi)->nat_tree_lock);
+ down_write(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- write_unlock(&NM_I(sbi)->nat_tree_lock);
+ up_write(&NM_I(sbi)->nat_tree_lock);

if (nat_get_blkaddr(ne) == NULL_ADDR)
add_free_nid(sbi, nid, false);
@@ -2000,7 +2000,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)

mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
- rwlock_init(&nm_i->nat_tree_lock);
+ init_rwsem(&nm_i->nat_tree_lock);

nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2056,7 +2056,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->free_nid_list_lock);

/* destroy nat cache */
- write_lock(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -2065,7 +2065,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
__del_from_nat_cache(nm_i, natvec[idx]);
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
- write_unlock(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);

kfree(nm_i->nat_bitmap);
sbi->nm_info = NULL;
--
2.1.1

2014-12-05 00:50:11

by Jaegeuk Kim

Subject: [PATCH 3/3] f2fs: call radix_tree_preload before radix_tree_insert

This patch tries to fix:

BUG: using smp_processor_id() in preemptible [00000000] code: f2fs_gc-254:0/384
(radix_tree_node_alloc+0x14/0x74) from [<c033d8a0>] (radix_tree_insert+0x110/0x200)
(radix_tree_insert+0x110/0x200) from [<c02e8264>] (gc_data_segment+0x340/0x52c)
(gc_data_segment+0x340/0x52c) from [<c02e8658>] (f2fs_gc+0x208/0x400)
(f2fs_gc+0x208/0x400) from [<c02e8a98>] (gc_thread_func+0x248/0x28c)
(gc_thread_func+0x248/0x28c) from [<c0139944>] (kthread+0xa0/0xac)
(kthread+0xa0/0xac) from [<c0105ef8>] (ret_from_fork+0x14/0x3c)

The reason is that f2fs calls radix_tree_insert with preemption enabled.
So, before calling it, we need to call radix_tree_preload.

Otherwise, we would have to use __GFP_WAIT for the radix tree and protect the
radix tree operations with a mutex or semaphore.
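
For reference, the general calling pattern is (a minimal sketch with
placeholder lock and tree names, not the exact f2fs code):

	/* Preload per-CPU radix tree nodes while sleeping is still
	 * allowed; on success it returns with preemption disabled. */
	if (radix_tree_preload(GFP_NOFS))
		return -ENOMEM;	/* or retry, as __add_ino_entry does */

	spin_lock(&lock);
	err = radix_tree_insert(&root, index, item);
	spin_unlock(&lock);

	radix_tree_preload_end();	/* re-enables preemption */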

Signed-off-by: Jaegeuk Kim <[email protected]>
---
fs/f2fs/checkpoint.c | 8 ++++++++
fs/f2fs/gc.c | 6 ++----
fs/f2fs/node.c | 6 +++---
3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 20a917b..6a81b73 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -304,6 +304,11 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
struct inode_management *im = &sbi->im[type];
struct ino_entry *e;
retry:
+ if (radix_tree_preload(GFP_NOFS)) {
+ cond_resched();
+ goto retry;
+ }
+
spin_lock(&im->ino_lock);

e = radix_tree_lookup(&im->ino_root, ino);
@@ -311,11 +316,13 @@ retry:
e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
if (!e) {
spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
goto retry;
}
if (radix_tree_insert(&im->ino_root, ino, e)) {
spin_unlock(&im->ino_lock);
kmem_cache_free(ino_entry_slab, e);
+ radix_tree_preload_end();
goto retry;
}
memset(e, 0, sizeof(struct ino_entry));
@@ -326,6 +333,7 @@ retry:
im->ino_num++;
}
spin_unlock(&im->ino_lock);
+ radix_tree_preload_end();
}

static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a1af74f..2c58c58 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -351,7 +351,6 @@ static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
struct inode_entry *new_ie;
- int ret;

if (inode == find_gc_inode(gc_list, inode->i_ino)) {
iput(inode);
@@ -361,8 +360,7 @@ retry:
new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
new_ie->inode = inode;

- ret = radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
- if (ret) {
+ if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) {
kmem_cache_free(winode_slab, new_ie);
goto retry;
}
@@ -703,7 +701,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
struct cp_control cpc;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
- .iroot = RADIX_TREE_INIT(GFP_ATOMIC),
+ .iroot = RADIX_TREE_INIT(GFP_NOFS),
};

cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b47555f..fa115d0 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1992,10 +1992,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;

- INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->free_nid_list);
- INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
- INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);

mutex_init(&nm_i->build_lock);
--
2.1.1

2014-12-05 03:50:38

by Gu Zheng

Subject: Re: [PATCH 1/3] f2fs: fix missing kmem_cache_free

Hi Jaegeuk,
On 12/05/2014 08:49 AM, Jaegeuk Kim wrote:

> This patch fixes a missing kmem_cache_free when handling errors.
>
> Signed-off-by: Jaegeuk Kim <[email protected]>
> ---
> fs/f2fs/node.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index b1466cf..c59341d 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -158,7 +158,7 @@ retry:
> head->entry_cnt = 0;
>
> if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
> - cond_resched();
> + kmem_cache_free(nat_entry_set_slab, head);

Why not reuse the allocated entry?
This routine is under nat_tree_lock, so the free_old -> re-search -> alloc_new
cycle is needless, because no one else can race with us.
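
Something like the following untested sketch, keeping the already-initialized
entry across retries:

	retry_insert:
		if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
			cond_resched();
			goto retry_insert;	/* reuse 'head' */
		}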

> goto retry;

And radix_tree_insert can only fail with -ENOMEM here; IMO, an immediate retry
makes very little sense. How about retaining the "cond_resched()" and retrying
the insert later?

If I misread something, please correct me. :)

Thanks,
Gu

> }
> }

2014-12-05 18:18:45

by Jaegeuk Kim

Subject: Re: [PATCH 1/3] f2fs: fix missing kmem_cache_free

Hi Gu,

On Fri, Dec 05, 2014 at 11:34:49AM +0800, Gu Zheng wrote:
> Hi Jaegeuk,
> On 12/05/2014 08:49 AM, Jaegeuk Kim wrote:
>
> > This patch fixes a missing kmem_cache_free when handling errors.
> >
> > Signed-off-by: Jaegeuk Kim <[email protected]>
> > ---
> > fs/f2fs/node.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> > index b1466cf..c59341d 100644
> > --- a/fs/f2fs/node.c
> > +++ b/fs/f2fs/node.c
> > @@ -158,7 +158,7 @@ retry:
> > head->entry_cnt = 0;
> >
> > if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
> > - cond_resched();
> > + kmem_cache_free(nat_entry_set_slab, head);
>
> Why not reuse the allocated entry?
> This routine is under nat_tree_lock, so the free_old -> re-search -> alloc_new
> cycle is needless, because no one else can race with us.

IMO, this issue is quite a different one from what you're mentioning.
This patch just fixes a missing kmem_cache_free.

>
> > goto retry;
>
> And radix_tree_insert can only fail with -ENOMEM here; IMO, an immediate retry
> makes very little sense. How about retaining the "cond_resched()" and retrying
> the insert later?

I think the radix_tree_preload patches that I submitted later in this series
would address that.
But it still seems that I need to review the retry paths again.

Thanks,

>
> If I misread something, please correct me. :)
>
> Thanks,
> Gu
>
> > }
> > }