Subject: Re: [PATCH v2] lightnvm: divide global reverse translation map into per lun
From: Wenwei Tao
To: Matias Bjørling
Cc: Javier González, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org
Date: Tue, 22 Mar 2016 18:17:08 +0800

It seems it is the pg_offset variable in rrpc_page_invalidate() that
causes the panic. It is a u64 variable, but we store the offset value
only in its low 32 bits and leave the high 32 bits in an undetermined
state; this makes test_and_set_bit() access unexpected memory and
causes the kernel panic. It should be initialized to zero before use.
I added some logging to verify this assumption; below is the log:

[  485.260549] rrpc: paddr: 400 sec_per_blk: 100 offset: ffff880000000000
[  485.260563] BUG: unable to handle kernel paging request at ffffba0000d4f230

The offset value ffff880000000000 may be passed to test_and_set_bit()
instead of 0.
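To make the failure mode concrete, here is a minimal userspace sketch
(my own illustration, not kernel code; it assumes a little-endian
machine, and div_u64_rem_sketch() is a hypothetical stand-in for the
kernel's div_u64_rem(), which likewise writes only a 32-bit remainder):

#include <stdint.h>
#include <stdio.h>

static void div_u64_rem_sketch(uint64_t dividend, uint32_t divisor,
                               uint32_t *remainder)
{
        *remainder = dividend % divisor;
}

int main(void)
{
        /* Pretend the stack slot holds stale data, as in the log above. */
        uint64_t pg_offset = 0xffff880000000000ULL;

        /*
         * The (uint32_t *) cast updates only the low 32 bits on a
         * little-endian machine; the high 32 bits keep their stale value.
         * 0x400 % 0x100 == 0, so pg_offset remains 0xffff880000000000.
         */
        div_u64_rem_sketch(0x400, 0x100, (uint32_t *)&pg_offset);

        /*
         * Passing a bit index this large to test_and_set_bit() would
         * touch memory far outside rblk->invalid_pages.
         */
        printf("pg_offset = 0x%llx\n", (unsigned long long)pg_offset);
        return 0;
}

Either zero-initializing pg_offset or keeping the remainder in a
separate u32 temporary avoids the stale high bits.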
2016-03-21 18:25 GMT+08:00 Matias Bjørling :
> On 03/20/2016 06:51 AM, Wenwei Tao wrote:
>>
>> Divide the target's global reverse translation map into per lun,
>> to prepare support for the non-continuous lun target creation.
>>
>> Signed-off-by: Wenwei Tao
>> ---
>>
>> Changes since v1
>> -fix merge/rebase mistake in rrpc_block_map_update().
>> -remove variables poffset and lun_offset in rrpc structure
>>  since no one use them now.
>>
>>  drivers/lightnvm/rrpc.c  | 181 +++++++++++++++++++++++++++--------------------
>>  drivers/lightnvm/rrpc.h  |   9 ++-
>>  include/linux/lightnvm.h |   1 +
>>  3 files changed, 111 insertions(+), 80 deletions(-)
>>
>> diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
>> index 3ab6495..88e2ce5 100644
>> --- a/drivers/lightnvm/rrpc.c
>> +++ b/drivers/lightnvm/rrpc.c
>> @@ -23,28 +23,35 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
>>                          struct nvm_rq *rqd, unsigned long flags);
>>
>>  #define rrpc_for_each_lun(rrpc, rlun, i) \
>> -                for ((i) = 0, rlun = &(rrpc)->luns[0]; \
>> -                        (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)])
>> +        for ((i) = 0, rlun = &(rrpc)->luns[0]; \
>> +                (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)])
>> +
>> +static inline u64 lun_poffset(struct nvm_lun *lun, struct nvm_dev *dev)
>> +{
>> +        return lun->id * dev->sec_per_lun;
>> +}
>>
>>  static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
>>  {
>>          struct rrpc_block *rblk = a->rblk;
>> -        unsigned int pg_offset;
>> +        struct rrpc_lun *rlun = rblk->rlun;
>> +        u64 pg_offset;
>>
>> -        lockdep_assert_held(&rrpc->rev_lock);
>> +        lockdep_assert_held(&rlun->rev_lock);
>>
>>          if (a->addr == ADDR_EMPTY || !rblk)
>>                  return;
>>
>>          spin_lock(&rblk->lock);
>>
>> -        div_u64_rem(a->addr, rrpc->dev->sec_per_blk, &pg_offset);
>> +        div_u64_rem(a->addr, rrpc->dev->sec_per_blk, (u32 *)&pg_offset);
>>          WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages));
>>          rblk->nr_invalid_pages++;
>>
>>          spin_unlock(&rblk->lock);
>>
>> -        rrpc->rev_trans_map[a->addr - rrpc->poffset].addr = ADDR_EMPTY;
>> +        pg_offset = lun_poffset(rlun->parent, rrpc->dev);
>> +        rlun->rev_trans_map[a->addr - pg_offset].addr = ADDR_EMPTY;
>>  }
>>
>>  static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
>> @@ -52,14 +59,15 @@ static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
>>  {
>>          sector_t i;
>>
>> -        spin_lock(&rrpc->rev_lock);
>>          for (i = slba; i < slba + len; i++) {
>>                  struct rrpc_addr *gp = &rrpc->trans_map[i];
>> +                struct rrpc_lun *rlun = gp->rblk->rlun;
>>
>> +                spin_lock(&rlun->rev_lock);
>>                  rrpc_page_invalidate(rrpc, gp);
>> +                spin_unlock(&rlun->rev_lock);
>>                  gp->rblk = NULL;
>>          }
>> -        spin_unlock(&rrpc->rev_lock);
>>  }
>>
>>  static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
>> @@ -116,15 +124,6 @@ static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
>>          return (rblk->next_page == rrpc->dev->sec_per_blk);
>>  }
>>
>> -/* Calculate relative addr for the given block, considering instantiated LUNs */
>> -static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
>> -{
>> -        struct nvm_block *blk = rblk->parent;
>> -        int lun_blk = blk->id % (rrpc->dev->blks_per_lun * rrpc->nr_luns);
>> -
>> -        return lun_blk * rrpc->dev->sec_per_blk;
>> -}
>> -
>>  /* Calculate global addr for the given block */
>>  static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
>>  {
>> @@ -291,13 +290,14 @@ static void rrpc_end_sync_bio(struct bio *bio)
>>  static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
>>  {
>>          struct request_queue *q = rrpc->dev->q;
>> +        struct rrpc_lun *rlun = rblk->rlun;
>>          struct rrpc_rev_addr *rev;
>>          struct nvm_rq *rqd;
>>          struct bio *bio;
>>          struct page *page;
>>          int slot;
>>          int nr_sec_per_blk = rrpc->dev->sec_per_blk;
>> -        u64 phys_addr;
>> +        u64 phys_addr, poffset;
>>          DECLARE_COMPLETION_ONSTACK(wait);
>>
>>          if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk))
>> @@ -315,6 +315,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
>>                  return -ENOMEM;
>>          }
>>
>> +        poffset = lun_poffset(rlun->parent, rrpc->dev);
>>          while ((slot = find_first_zero_bit(rblk->invalid_pages,
>>                                             nr_sec_per_blk)) < nr_sec_per_blk) {
>>
>> @@ -322,23 +323,23 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
>>                  phys_addr = rblk->parent->id * nr_sec_per_blk + slot;
>>
>>  try:
>> -                spin_lock(&rrpc->rev_lock);
>> +                spin_lock(&rlun->rev_lock);
>>                  /* Get logical address from physical to logical table */
>> -                rev = &rrpc->rev_trans_map[phys_addr - rrpc->poffset];
>> +                rev = &rlun->rev_trans_map[phys_addr - poffset];
>>                  /* already updated by previous regular write */
>>                  if (rev->addr == ADDR_EMPTY) {
>> -                        spin_unlock(&rrpc->rev_lock);
>> +                        spin_unlock(&rlun->rev_lock);
>>                          continue;
>>                  }
>>
>>                  rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
>>                  if (IS_ERR_OR_NULL(rqd)) {
>> -                        spin_unlock(&rrpc->rev_lock);
>> +                        spin_unlock(&rlun->rev_lock);
>>                          schedule();
>>                          goto try;
>>                  }
>>
>> -                spin_unlock(&rrpc->rev_lock);
>> +                spin_unlock(&rlun->rev_lock);
>>
>>                  /* Perform read to do GC */
>>                  bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
>> @@ -407,7 +408,7 @@ static void rrpc_block_gc(struct work_struct *work)
>>          struct rrpc_block *rblk = gcb->rblk;
>>          struct nvm_dev *dev = rrpc->dev;
>>          struct nvm_lun *lun = rblk->parent->lun;
>> -        struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset];
>> +        struct rrpc_lun *rlun = lun->private;
>>
>>          mempool_free(gcb, rrpc->gcb_pool);
>>          pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id);
>> @@ -510,7 +511,7 @@ static void rrpc_gc_queue(struct work_struct *work)
>>          struct rrpc_block *rblk = gcb->rblk;
>>          struct nvm_lun *lun = rblk->parent->lun;
>>          struct nvm_block *blk = rblk->parent;
>> -        struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset];
>> +        struct rrpc_lun *rlun = lun->private;
>>
>>          spin_lock(&rlun->lock);
>>          list_add_tail(&rblk->prio, &rlun->prio_list);
>> @@ -561,22 +562,25 @@ static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc)
>>  static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr,
>>                                           struct rrpc_block *rblk, u64 paddr)
>>  {
>> +        struct rrpc_lun *rlun = rblk->rlun;
>>          struct rrpc_addr *gp;
>>          struct rrpc_rev_addr *rev;
>> +        u64 poffset = lun_poffset(rlun->parent, rrpc->dev);
>>
>>          BUG_ON(laddr >= rrpc->nr_sects);
>>
>>          gp = &rrpc->trans_map[laddr];
>> -        spin_lock(&rrpc->rev_lock);
>> +        spin_lock(&rlun->rev_lock);
>> +
>>          if (gp->rblk)
>>                  rrpc_page_invalidate(rrpc, gp);
>>
>>          gp->addr = paddr;
>>          gp->rblk = rblk;
>>
>> -        rev = &rrpc->rev_trans_map[gp->addr - rrpc->poffset];
>> +        rev = &rlun->rev_trans_map[gp->addr - poffset];
>>          rev->addr = laddr;
>> -        spin_unlock(&rrpc->rev_lock);
>> +        spin_unlock(&rlun->rev_lock);
>>
>>          return gp;
>>  }
>> @@ -990,7 +994,6 @@ static int rrpc_gc_init(struct rrpc *rrpc)
>>
>>  static void rrpc_map_free(struct rrpc *rrpc)
>>  {
>> -        vfree(rrpc->rev_trans_map);
>>          vfree(rrpc->trans_map);
>>  }
>>
>> @@ -998,8 +1001,8 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
>>  {
>>          struct rrpc *rrpc = (struct rrpc *)private;
>>          struct nvm_dev *dev = rrpc->dev;
>> -        struct rrpc_addr *addr = rrpc->trans_map + slba;
>> -        struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
>> +        struct rrpc_addr *addr;
>> +        struct rrpc_rev_addr *raddr;
>>          u64 elba = slba + nlb;
>>          u64 i;
>>
>> @@ -1008,9 +1011,15 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
>>                  return -EINVAL;
>>          }
>>
>> +        slba -= rrpc->soffset >> (ilog2(dev->sec_size) - 9);
>> +        addr = rrpc->trans_map + slba;
>>          for (i = 0; i < nlb; i++) {
>> +                struct rrpc_lun *rlun;
>> +                struct nvm_lun *lun;
>>                  u64 pba = le64_to_cpu(entries[i]);
>> -                unsigned int mod;
>> +                u64 poffset;
>> +                int lunid;
>> +
>>                  /* LNVM treats address-spaces as silos, LBA and PBA are
>>                   * equally large and zero-indexed.
>>                   */
>> @@ -1026,10 +1035,18 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
>>                  if (!pba)
>>                          continue;
>>
>> -                div_u64_rem(pba, rrpc->nr_sects, &mod);
>> +                lunid = div_u64(pba, dev->sec_per_lun);
>> +                lun = dev->mt->get_lun(dev, lunid);
>> +
>> +                if (unlikely(!lun))
>> +                        return -EINVAL;
>> +
>> +                rlun = lun->private;
>> +                raddr = rlun->rev_trans_map;
>> +                poffset = lun_poffset(lun, dev);
>>
>>                  addr[i].addr = pba;
>> -                raddr[mod].addr = slba + i;
>> +                raddr[pba - poffset].addr = slba + i;
>>          }
>>
>>          return 0;
>> @@ -1048,17 +1065,10 @@ static int rrpc_map_init(struct rrpc *rrpc)
>>          if (!rrpc->trans_map)
>>                  return -ENOMEM;
>>
>> -        rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr)
>> -                                      * rrpc->nr_sects);
>> -        if (!rrpc->rev_trans_map)
>> -                return -ENOMEM;
>> -
>>          for (i = 0; i < rrpc->nr_sects; i++) {
>>                  struct rrpc_addr *p = &rrpc->trans_map[i];
>> -                struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i];
>>
>>                  p->addr = ADDR_EMPTY;
>> -                r->addr = ADDR_EMPTY;
>>          }
>>
>>          if (!dev->ops->get_l2p_tbl)
>> @@ -1143,25 +1153,69 @@ static void rrpc_luns_free(struct rrpc *rrpc)
>>                  if (!lun)
>>                          break;
>>                  dev->mt->release_lun(dev, lun->id);
>> +                vfree(rlun->rev_trans_map);
>>                  vfree(rlun->blocks);
>>          }
>>
>>          kfree(rrpc->luns);
>> +        rrpc->luns = NULL;
>> +}
>> +
>> +static int rrpc_lun_init(struct rrpc *rrpc, struct rrpc_lun *rlun,
>> +                         struct nvm_lun *lun)
>> +{
>> +        struct nvm_dev *dev = rrpc->dev;
>> +        int i;
>> +
>> +        rlun->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr) *
>> +                                      dev->sec_per_lun);
>> +        if (!rlun->rev_trans_map)
>> +                return -ENOMEM;
>> +
>> +        for (i = 0; i < dev->sec_per_lun; i++) {
>> +                struct rrpc_rev_addr *r = &rlun->rev_trans_map[i];
>> +
>> +                r->addr = ADDR_EMPTY;
>> +        }
>> +        rlun->blocks = vzalloc(sizeof(struct rrpc_block) * dev->blks_per_lun);
>> +        if (!rlun->blocks) {
>> +                vfree(rlun->rev_trans_map);
>> +                return -ENOMEM;
>> +        }
>> +
>> +        for (i = 0; i < dev->blks_per_lun; i++) {
>> +                struct rrpc_block *rblk = &rlun->blocks[i];
>> +                struct nvm_block *blk = &lun->blocks[i];
>> +
>> +                rblk->parent = blk;
>> +                rblk->rlun = rlun;
>> +                INIT_LIST_HEAD(&rblk->prio);
>> +                spin_lock_init(&rblk->lock);
>> +        }
>> +
>> +        rlun->rrpc = rrpc;
>> +        lun->private = rlun;
>> +        INIT_LIST_HEAD(&rlun->prio_list);
>> +        INIT_LIST_HEAD(&rlun->open_list);
>> +        INIT_LIST_HEAD(&rlun->closed_list);
>> +        INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
>> +        spin_lock_init(&rlun->lock);
>> +        spin_lock_init(&rlun->rev_lock);
>> +
>> +        return 0;
>>  }
>>
>>  static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
>>  {
>>          struct nvm_dev *dev = rrpc->dev;
>>          struct rrpc_lun *rlun;
>> -        int i, j, ret = -EINVAL;
>> +        int i, ret = -EINVAL;
>>
>>          if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
>>                  pr_err("rrpc: number of pages per block too high.");
>>                  return -EINVAL;
>>          }
>>
>> -        spin_lock_init(&rrpc->rev_lock);
>> -
>>          rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun),
>>                               GFP_KERNEL);
>>          if (!rrpc->luns)
>> @@ -1183,31 +1237,9 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
>>
>>                  rlun = &rrpc->luns[i];
>>                  rlun->parent = lun;
>> -                rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
>> -                                       rrpc->dev->blks_per_lun);
>> -                if (!rlun->blocks) {
>> -                        ret = -ENOMEM;
>> +                ret = rrpc_lun_init(rrpc, rlun, lun);
>> +                if (ret)
>>                          goto err;
>> -                }
>> -
>> -                for (j = 0; j < rrpc->dev->blks_per_lun; j++) {
>> -                        struct rrpc_block *rblk = &rlun->blocks[j];
>> -                        struct nvm_block *blk = &lun->blocks[j];
>> -
>> -                        rblk->parent = blk;
>> -                        rblk->rlun = rlun;
>> -                        INIT_LIST_HEAD(&rblk->prio);
>> -                        spin_lock_init(&rblk->lock);
>> -                }
>> -
>> -                rlun->rrpc = rrpc;
>> -                INIT_LIST_HEAD(&rlun->prio_list);
>> -                INIT_LIST_HEAD(&rlun->open_list);
>> -                INIT_LIST_HEAD(&rlun->closed_list);
>> -
>> -                INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
>> -                spin_lock_init(&rlun->lock);
>> -
>>                  rrpc->total_blocks += dev->blks_per_lun;
>>                  rrpc->nr_sects += dev->sec_per_lun;
>>
>> @@ -1215,6 +1247,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
>>
>>          return 0;
>>  err:
>> +        rrpc_luns_free(rrpc);
>>          return ret;
>>  }
>>
>> @@ -1288,15 +1321,16 @@ static sector_t rrpc_capacity(void *private)
>>  static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
>>  {
>>          struct nvm_dev *dev = rrpc->dev;
>> +        struct rrpc_lun *rlun = rblk->rlun;
>>          int offset;
>>          struct rrpc_addr *laddr;
>> -        u64 bpaddr, paddr, pladdr;
>> +        u64 paddr, pladdr, poffset;
>>
>> -        bpaddr = block_to_rel_addr(rrpc, rblk);
>> +        poffset = lun_poffset(rlun->parent, dev);
>>          for (offset = 0; offset < dev->sec_per_blk; offset++) {
>> -                paddr = bpaddr + offset;
>> +                paddr = block_to_addr(rrpc, rblk) + offset;
>>
>> -                pladdr = rrpc->rev_trans_map[paddr].addr;
>> +                pladdr = rlun->rev_trans_map[paddr - poffset].addr;
>>                  if (pladdr == ADDR_EMPTY)
>>                          continue;
>>
>> @@ -1405,9 +1439,6 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
>>                  goto err;
>>          }
>>
>> -        rrpc->poffset = dev->sec_per_lun * lun_begin;
>> -        rrpc->lun_offset = lun_begin;
>> -
>>          ret = rrpc_core_init(rrpc);
>>          if (ret) {
>>                  pr_err("nvm: rrpc: could not initialize core\n");
>> diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
>> index 2653484..590f1b7 100644
>> --- a/drivers/lightnvm/rrpc.h
>> +++ b/drivers/lightnvm/rrpc.h
>> @@ -87,6 +87,10 @@ struct rrpc_lun {
>>
>>          struct work_struct ws_gc;
>>
>> +        /* store a reverse map for garbage collection */
>> +        struct rrpc_rev_addr *rev_trans_map;
>> +        spinlock_t rev_lock;
>> +
>>          spinlock_t lock;
>>  };
>>
>> @@ -98,8 +102,6 @@ struct rrpc {
>>          struct gendisk *disk;
>>
>>          sector_t soffset; /* logical sector offset */
>> -        u64 poffset; /* physical page offset */
>> -        int lun_offset;
>>
>>          int nr_luns;
>>          struct rrpc_lun *luns;
>> @@ -124,9 +126,6 @@ struct rrpc {
>>           * addresses are used when writing to the disk block device.
>>           */
>>          struct rrpc_addr *trans_map;
>> -        /* also store a reverse map for garbage collection */
>> -        struct rrpc_rev_addr *rev_trans_map;
>> -        spinlock_t rev_lock;
>>
>>          struct rrpc_inflight inflights;
>>
>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>> index cdcb2cc..419a3db 100644
>> --- a/include/linux/lightnvm.h
>> +++ b/include/linux/lightnvm.h
>> @@ -276,6 +276,7 @@ struct nvm_lun {
>>          spinlock_t lock;
>>
>>          struct nvm_block *blocks;
>> +        void *private;
>>  };
>>
>>  enum {
>>
>
> It seems like there is a race-condition when GC hits. It panics on
> page_invalidation.
> You can test with the following fio script:
>
> [global]
> bs=4k
> ioengine=libaio
> iodepth=1
> size=1G
> direct=1
> rw=randwrite
> time_based
> runtime=300
>
> [p1]
> numjobs=1
> filename=/dev/test2
>
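For reference, assuming the job file above is saved as, say,
gc-test.fio (the name is just a placeholder) and /dev/test2 is the
rrpc target block device, it can be run with:

$ fio gc-test.fio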