LinuxLists.cc - [PATCH RFC 0/5] lightnvm: Introduce System Blocks

2015-12-14 13:17:33

Subject: [PATCH RFC 0/5] lightnvm: Introduce System Blocks

Hi,

Currently a device is brought up via the media manager, which detects if
a device supports it. We would like to allow a drive to be initialized to
a specific media manager and therefore introduce new functionality in the
core to scan a specific set of flash blocks that maintain what we call
system blocks.

With this patchset, the disk should first be initialized to a given
media manager, which then takes control over the device. The core and
media managers are free to update the system block for the device, for
example when the initialization PPA for the media manager changes, or
for other reasons.

A system block is duplicated in three places to prevent the system block
data from becoming unreachable. We currently allocate blocks on three
different luns: the first lun from the first channel, the first lun from
the middle channel, and the first lun from the last channel. If a device
only has one or two channels, only one or two system blocks are
maintained.

The three luns each have two blocks reserved during initialization,
which amounts to approximately 1.5M updates in total, which is far more
updates than we expect with current workloads.

The first four patches prepare the core to directly interact with the
device, and the last patch introduces the recovery scheme.

Later patches will add the management functionality and integrate with
the gennvm media manager.

Thanks,
Matias

Matias Bjørling (5):
lightnvm: move ppa erase logic to core
lightnvm: refactor rqd ppa list into set/free
lightnvm: add sync support for submit_io
lightnvm: introduce nvm_submit_ppa
lightnvm: core on-disk initialization

drivers/lightnvm/Makefile | 2 +-
drivers/lightnvm/core.c | 128 +++++++++++
drivers/lightnvm/gennvm.c | 68 +-----
drivers/lightnvm/sysblk.c | 524 +++++++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/lightnvm.c | 7 +
include/linux/lightnvm.h | 39 ++++
6 files changed, 703 insertions(+), 65 deletions(-)
create mode 100644 drivers/lightnvm/sysblk.c

--
2.1.4

2015-12-14 13:18:49

by Matias Bjørling

[permalink] [raw]

Subject: [PATCH RFC 1/5] lightnvm: move ppa erase logic to core

A device may function in single, dual or quad plane mode. The gennvm
media manager manages this with explicit helpers. They convert a single
ppa to 1, 2 or 4 separate ppas in a ppa list. To aid implementation of
recovery and system blocks, this functionality can be moved directly
into the core.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/lightnvm/core.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
drivers/lightnvm/gennvm.c | 68 +++--------------------------------------------
include/linux/lightnvm.h | 3 +++
3 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 8f41b24..6134339 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -192,6 +192,73 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk)
}
EXPORT_SYMBOL(nvm_erase_blk);

+void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+ int i;
+
+ if (rqd->nr_pages > 1) {
+ for (i = 0; i < rqd->nr_pages; i++)
+ rqd->ppa_list[i] = dev_to_generic_addr(dev,
+ rqd->ppa_list[i]);
+ } else {
+ rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
+ }
+}
+EXPORT_SYMBOL(nvm_addr_to_generic_mode);
+
+void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+ int i;
+
+ if (rqd->nr_pages > 1) {
+ for (i = 0; i < rqd->nr_pages; i++)
+ rqd->ppa_list[i] = generic_to_dev_addr(dev,
+ rqd->ppa_list[i]);
+ } else {
+ rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
+ }
+}
+EXPORT_SYMBOL(nvm_generic_to_addr_mode);
+
+int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr ppa)
+{
+ int plane_cnt = 0, pl_idx, ret;
+ struct nvm_rq rqd;
+
+ if (!dev->ops->erase_block)
+ return 0;
+
+ if (dev->plane_mode == NVM_PLANE_SINGLE) {
+ rqd.nr_pages = 1;
+ rqd.ppa_addr = ppa;
+ } else {
+ plane_cnt = (1 << dev->plane_mode);
+ rqd.nr_pages = plane_cnt;
+
+ rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL,
+ &rqd.dma_ppa_list);
+ if (!rqd.ppa_list) {
+ pr_err("nvm: failed to allocate dma memory\n");
+ return -ENOMEM;
+ }
+
+ for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+ ppa.g.pl = pl_idx;
+ rqd.ppa_list[pl_idx] = ppa;
+ }
+ }
+
+ nvm_generic_to_addr_mode(dev, &rqd);
+
+ ret = dev->ops->erase_block(dev, &rqd);
+
+ if (plane_cnt)
+ nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list);
+
+ return ret;
+}
+EXPORT_SYMBOL(nvm_erase_ppa);
+
static int nvm_core_init(struct nvm_dev *dev)
{
struct nvm_id *id = &dev->identity;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index f434e89..ba1e671 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -316,39 +316,13 @@ static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
spin_unlock(&vlun->lock);
}

-static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- int i;
-
- if (rqd->nr_pages > 1) {
- for (i = 0; i < rqd->nr_pages; i++)
- rqd->ppa_list[i] = dev_to_generic_addr(dev,
- rqd->ppa_list[i]);
- } else {
- rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr);
- }
-}
-
-static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- int i;
-
- if (rqd->nr_pages > 1) {
- for (i = 0; i < rqd->nr_pages; i++)
- rqd->ppa_list[i] = generic_to_dev_addr(dev,
- rqd->ppa_list[i]);
- } else {
- rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
- }
-}
-
static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
{
if (!dev->ops->submit_io)
return 0;

/* Convert address space */
- gennvm_generic_to_addr_mode(dev, rqd);
+ nvm_generic_to_addr_mode(dev, rqd);

rqd->dev = dev;
return dev->ops->submit_io(dev, rqd);
@@ -390,7 +364,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
if (dev->ops->set_bb_tbl(dev, rqd, 1))
return;

- gennvm_addr_to_generic_mode(dev, rqd);
+ nvm_addr_to_generic_mode(dev, rqd);

/* look up blocks and mark them as bad */
if (rqd->nr_pages > 1)
@@ -424,43 +398,9 @@ static int gennvm_end_io(struct nvm_rq *rqd, int error)
static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
unsigned long flags)
{
- int plane_cnt = 0, pl_idx, ret;
- struct ppa_addr addr;
- struct nvm_rq rqd;
+ struct ppa_addr addr = block_to_ppa(dev, blk);

- if (!dev->ops->erase_block)
- return 0;
-
- addr = block_to_ppa(dev, blk);
-
- if (dev->plane_mode == NVM_PLANE_SINGLE) {
- rqd.nr_pages = 1;
- rqd.ppa_addr = addr;
- } else {
- plane_cnt = (1 << dev->plane_mode);
- rqd.nr_pages = plane_cnt;
-
- rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL,
- &rqd.dma_ppa_list);
- if (!rqd.ppa_list) {
- pr_err("gennvm: failed to allocate dma memory\n");
- return -ENOMEM;
- }
-
- for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
- addr.g.pl = pl_idx;
- rqd.ppa_list[pl_idx] = addr;
- }
- }
-
- gennvm_generic_to_addr_mode(dev, &rqd);
-
- ret = dev->ops->erase_block(dev, &rqd);
-
- if (plane_cnt)
- nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list);
-
- return ret;
+ return nvm_erase_ppa(dev, addr);
}

static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid)
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 034117b..c228dbc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -427,6 +427,9 @@ extern int nvm_register(struct request_queue *, char *,
extern void nvm_unregister(char *);

extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
+extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
+extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
+extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr);
extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
#else /* CONFIG_NVM */
struct nvm_dev_ops;
--
2.1.4

2015-12-14 13:17:38

by Matias Bjørling

[permalink] [raw]

Subject: [PATCH RFC 2/5] lightnvm: refactor rqd ppa list into set/free

A device may be driven in single, double or quad plane mode. Depending on
the mode, the rqd must have either one, two, or four PPAs set for a single
PPA sent to the device. Refactor this logic into its own functions to be
shared by program/erase/read in the core.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/lightnvm/core.c | 71 ++++++++++++++++++++++++++++++++++--------------
include/linux/lightnvm.h | 3 ++
2 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 6134339..081b0f5 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -220,40 +220,69 @@ void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
}
EXPORT_SYMBOL(nvm_generic_to_addr_mode);

+int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+ struct ppa_addr *ppas, int nr_ppas)
+{
+ int i, plane_cnt, pl_idx;
+
+ if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
+ rqd->nr_pages = 1;
+ rqd->ppa_addr = ppas[0];
+
+ return 0;
+ }
+
+ plane_cnt = (1 << dev->plane_mode);
+ rqd->nr_pages = plane_cnt * nr_ppas;
+
+ if (dev->ops->max_phys_sect < rqd->nr_pages)
+ return -EINVAL;
+
+ rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ pr_err("nvm: failed to allocate dma memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_ppas; i++) {
+ for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
+ ppas[i].g.pl = pl_idx;
+ rqd->ppa_list[(i * plane_cnt) + pl_idx] = ppas[i];
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(nvm_set_rqd_ppalist);
+
+void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+{
+ if (!rqd->ppa_list)
+ return;
+
+ nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+}
+EXPORT_SYMBOL(nvm_free_rqd_ppalist);
+
int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr ppa)
{
- int plane_cnt = 0, pl_idx, ret;
struct nvm_rq rqd;
+ int ret;

if (!dev->ops->erase_block)
return 0;

- if (dev->plane_mode == NVM_PLANE_SINGLE) {
- rqd.nr_pages = 1;
- rqd.ppa_addr = ppa;
- } else {
- plane_cnt = (1 << dev->plane_mode);
- rqd.nr_pages = plane_cnt;
+ memset(&rqd, 0, sizeof(struct nvm_rq));

- rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL,
- &rqd.dma_ppa_list);
- if (!rqd.ppa_list) {
- pr_err("nvm: failed to allocate dma memory\n");
- return -ENOMEM;
- }
-
- for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) {
- ppa.g.pl = pl_idx;
- rqd.ppa_list[pl_idx] = ppa;
- }
- }
+ ret = nvm_set_rqd_ppalist(dev, &rqd, &ppa, 1);
+ if (ret)
+ return ret;

nvm_generic_to_addr_mode(dev, &rqd);

ret = dev->ops->erase_block(dev, &rqd);

- if (plane_cnt)
- nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list);
+ nvm_free_rqd_ppalist(dev, &rqd);

return ret;
}
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index c228dbc..2fd6871 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -429,6 +429,9 @@ extern void nvm_unregister(char *);
extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
+extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+ struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr);
extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
#else /* CONFIG_NVM */
--
2.1.4

2015-12-14 13:17:36

by Matias Bjørling

[permalink] [raw]

Subject: [PATCH RFC 3/5] lightnvm: add sync support for submit_io

Allow read and write I/Os to be issued synchronously. Users include the
LightNVM core to implement system block support and similar.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/nvme/host/lightnvm.c | 7 +++++++
include/linux/lightnvm.h | 3 +++
2 files changed, 10 insertions(+)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 15f2acb..1454e53 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -496,6 +496,13 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
rq->cmd_len = sizeof(struct nvme_nvm_command);
rq->special = (void *)0;

+ if (rqd->flags & NVM_IO_F_SYNC) {
+ int err = blk_execute_rq(q, NULL, rq, 0);
+ kfree(cmd);
+ blk_mq_free_request(rq);
+ return err;
+ }
+
rq->end_io_data = rqd;

blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2fd6871..770278a 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -60,6 +60,9 @@ enum {
NVM_BLK_T_BAD = 0x1,
NVM_BLK_T_DEV = 0x2,
NVM_BLK_T_HOST = 0x4,
+
+ /* NVM Request Flags */
+ NVM_IO_F_SYNC = 0x1,
};

struct nvm_id_group {
--
2.1.4

2015-12-14 13:18:30

by Matias Bjørling

[permalink] [raw]

Subject: [PATCH RFC 4/5] lightnvm: introduce nvm_submit_ppa

Internal logic for both core and media managers does not have a
backing bio for issuing I/Os. Introduce nvm_submit_ppa to allow raw
I/Os to be submitted to the underlying device driver.

The function takes the device, ppa, opcode, data buffer and its length,
and submits the I/O synchronously to the device. The return value may
therefore be used to detect any errors regarding the issued I/O.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/lightnvm/core.c | 31 +++++++++++++++++++++++++++++++
include/linux/lightnvm.h | 1 +
2 files changed, 32 insertions(+)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 081b0f5..77a9907 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -288,6 +288,37 @@ int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr ppa)
}
EXPORT_SYMBOL(nvm_erase_ppa);

+int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr ppa, int opcode,
+ void *buf, int len)
+{
+ struct nvm_rq rqd;
+ struct bio *bio;
+ int ret;
+
+ bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL);
+ if (IS_ERR_OR_NULL(bio))
+ return -ENOMEM;
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+ ret = nvm_set_rqd_ppalist(dev, &rqd, &ppa, 1);
+ if (ret) {
+ bio_put(bio);
+ return ret;
+ }
+
+ rqd.opcode = opcode;
+ rqd.flags = NVM_IO_F_SYNC;
+ rqd.bio = bio;
+ nvm_generic_to_addr_mode(dev, &rqd);
+
+ ret = dev->ops->submit_io(dev, &rqd);
+
+ nvm_free_rqd_ppalist(dev, &rqd);
+
+ return ret;
+}
+EXPORT_SYMBOL(nvm_submit_ppa);
+
static int nvm_core_init(struct nvm_dev *dev)
{
struct nvm_id *id = &dev->identity;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 770278a..0017d55 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -437,6 +437,7 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr);
extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
+extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr, int, void *, int);
#else /* CONFIG_NVM */
struct nvm_dev_ops;

--
2.1.4

2015-12-14 13:17:56

by Matias Bjørling

[permalink] [raw]

Subject: [PATCH RFC 5/5] lightnvm: core on-disk initialization

An Open-Channel SSD shall be initialized before use. To initialize, we
define an on-disk format, that keeps a small set of metadata to bring up
the media manager on top of the device.

The initial step is introduced to allow a user to format the disks for a
given media manager. During format, a system block is stored on one to
three separate luns on the device. Each lun has the system block
duplicated. During initialization, the system block can be retrieved and
the appropriate media manager can be initialized.

The on-disk format currently covers (struct nvm_system_block):

- Magic value "NVMS".
- Monotonic increasing sequence number.
- The physical block erase count.
- Version of the system block format.
- Media manager type.
- Media manager superblock physical address.

The interface provides three functions to manage the system block:

int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *)
int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *)
int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *)

Each implements a part of the logic to manage the system block. The
initialization creates the first system blocks and marks them on the
device. Get retrieves the latest system block by scanning all pages in
the associated system blocks. The update sysblock writes new metadata
and allocates a new block if necessary.

Signed-off-by: Matias Bjørling <[email protected]>
---
drivers/lightnvm/Makefile | 2 +-
drivers/lightnvm/core.c | 1 +
drivers/lightnvm/sysblk.c | 524 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/lightnvm.h | 29 +++
4 files changed, 555 insertions(+), 1 deletion(-)
create mode 100644 drivers/lightnvm/sysblk.c

diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 7e0f42a..a7a0a22 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -2,6 +2,6 @@
# Makefile for Open-Channel SSDs.
#

-obj-$(CONFIG_NVM) := core.o
+obj-$(CONFIG_NVM) := core.o sysblk.o
obj-$(CONFIG_NVM_GENNVM) += gennvm.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 77a9907..c22ca42 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -365,6 +365,7 @@ static int nvm_core_init(struct nvm_dev *dev)
dev->nr_chnls;
dev->total_pages = dev->total_blocks * dev->pgs_per_blk;
INIT_LIST_HEAD(&dev->online_targets);
+ mutex_init(&dev->mlock);

return 0;
}
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
new file mode 100644
index 0000000..0e7808d
--- /dev/null
+++ b/drivers/lightnvm/sysblk.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (C) 2015 Matias Bjorling. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/lightnvm.h>
+
+#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */
+#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases
+ * enables ~1.5M updates per sysblk unit
+ */
+
+struct sysblk_scan {
+ /* A row is a collection of flash blocks for a system block. */
+ int nr_rows;
+ int row;
+ int act_blk[MAX_SYSBLKS];
+
+ int nr_ppas;
+ struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */
+};
+
+static inline int scan_ppa_idx(struct sysblk_scan *s, int row, int blkid)
+{
+ return (row * s->nr_rows) + blkid;
+}
+
+void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb)
+{
+ info->seqnr = be32_to_cpu(sb->seqnr);
+ info->erase_cnt = be32_to_cpu(sb->erase_cnt);
+ info->version = be16_to_cpu(sb->version);
+ info->mmtype = be16_to_cpu(sb->mmtype);
+ info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa);
+}
+
+void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info)
+{
+ sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC);
+ sb->seqnr = cpu_to_be32(info->seqnr);
+ sb->erase_cnt = cpu_to_be32(info->erase_cnt);
+ sb->version = cpu_to_be16(info->version);
+ sb->mmtype = cpu_to_be16(info->mmtype);
+ sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa);
+}
+
+static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
+{
+ int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls);
+ int i;
+
+ for (i = 0; i < nr_rows; i++)
+ sysblk_ppas[i].ppa = 0;
+
+ /* if possible, place sysblk at first channel, middle channel and last
+ * channel of the device. If not, create only one or two sys blocks
+ */
+ switch (dev->nr_chnls) {
+ case 2:
+ sysblk_ppas[1].g.ch = 1;
+ /* fall-through */
+ case 1:
+ sysblk_ppas[0].g.ch = 0;
+ break;
+ default:
+ sysblk_ppas[0].g.ch = 0;
+ sysblk_ppas[1].g.ch = dev->nr_chnls / 2;
+ sysblk_ppas[2].g.ch = dev->nr_chnls - 1;
+ break;
+ }
+
+ return nr_rows;
+}
+
+void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s,
+ struct ppa_addr *sysblk_ppas)
+{
+ memset(s, 0, sizeof(struct sysblk_scan));
+ s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas);
+}
+
+static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
+ void *private)
+{
+ struct sysblk_scan *s = private;
+ int i, nr_sysblk = 0;
+
+ for (i = 0; i < nr_blks; i++) {
+ if (blks[i] != NVM_BLK_T_HOST)
+ continue;
+
+ if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) {
+ pr_err("nvm: too many host blks\n");
+ return -EINVAL;
+ }
+
+ ppa.g.blk = i;
+
+ s->ppas[scan_ppa_idx(s, s->row, nr_sysblk)] = ppa;
+ s->nr_ppas++;
+ nr_sysblk++;
+ }
+
+ return 0;
+}
+
+static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
+ struct ppa_addr *row_ppas)
+{
+ struct ppa_addr dev_ppa;
+ int i, ret;
+
+ s->nr_ppas = 0;
+
+ for (i = 0; i < s->nr_rows; i++) {
+ dev_ppa = generic_to_dev_addr(dev, row_ppas[i]);
+ s->row = i;
+
+ ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun,
+ sysblk_get_host_blks, s);
+ if (ret) {
+ pr_err("nvm: failed bb tbl for ch%u lun%u\n",
+ row_ppas[i].g.ch,
+ row_ppas[i].g.blk);
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * scans a block for latest sysblk.
+ * Returns:
+ * 0 - newer sysblk not found. PPA is updated to latest page.
+ * 1 - newer sysblk found and stored in *cur. PPA is updated to
+ * next valid page.
+ * <0- error.
+ */
+static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
+ struct nvm_system_block *sblk)
+{
+ struct nvm_system_block *cur;
+ int pg, cursz, ret, found = 0;
+
+ /* the full buffer for a flash page is allocated. Only the first of it
+ * contains the system block information
+ */
+ cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
+ cur = kmalloc(cursz, GFP_KERNEL);
+ if (!cur)
+ return -ENOMEM;
+
+ /* perform linear scan through the block */
+ for (pg = 0; pg < dev->pgs_per_blk; pg++) {
+
+ ret = nvm_submit_ppa(dev, *ppa, NVM_OP_PREAD, cur, cursz);
+ if (ret)
+ break; /* if we can't read a page, continue to the
+ * next blk
+ */
+
+ if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) {
+ pr_debug("nvm: scan break at ch: %u lun: %u blk:%u pg:%u\n",
+ (*ppa).g.ch,
+ (*ppa).g.lun,
+ (*ppa).g.blk,
+ (*ppa).g.pg);
+ break; /* last valid page already found */
+ }
+
+ if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr))
+ continue;
+
+ ppa->g.pg = pg;
+ memcpy(sblk, cur, sizeof(struct nvm_system_block));
+ found = 1;
+ }
+
+ kfree(cur);
+
+ return found;
+}
+
+static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks,
+ void *private)
+{
+ struct sysblk_scan *s = private;
+ int i, blkid = 0;
+
+ for (i = 0; i < nr_blks; i++) {
+ if (blks[i] == NVM_BLK_T_HOST) {
+ pr_err("nvm: device already initialized\n");
+ return -EEXIST;
+ }
+
+ if (blks[i] != NVM_BLK_T_FREE)
+ continue;
+
+ s->ppas[scan_ppa_idx(s, s->row, blkid)].g.blk = i;
+ s->nr_ppas++;
+ blkid++;
+
+ if (blkid > MAX_BLKS_PR_SYSBLK - 1)
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int nvm_mark_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
+ struct ppa_addr *ppas)
+{
+ struct nvm_rq rqd;
+ struct ppa_addr dev_ppa;
+ int i, ret;
+
+ for (i = 0; i < s->nr_rows; i++) {
+ dev_ppa = generic_to_dev_addr(dev, ppas[i]);
+
+ s->row = i;
+ ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun,
+ sysblk_get_free_blks, s);
+ if (ret) {
+ pr_err("nvm: sysblk failed bb tbl for ch%u lun%u\n",
+ ppas[i].g.ch,
+ ppas[i].g.blk);
+ return -EINVAL;
+ }
+ }
+
+ if (s->nr_ppas > dev->ops->max_phys_sect) {
+ pr_err("nvm: unable to update all sysblocks atomically\n");
+ return -EINVAL;
+ }
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas);
+ nvm_generic_to_addr_mode(dev, &rqd);
+
+ ret = dev->ops->set_bb_tbl(dev, &rqd, NVM_BLK_T_HOST);
+ nvm_free_rqd_ppalist(dev, &rqd);
+ if (ret) {
+ pr_err("nvm: sysblk failed bb mark\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
+ struct sysblk_scan *s)
+{
+ struct nvm_system_block nvmsb;
+ void *buf;
+ int i, ret, bufsz;
+
+ nvm_cpu_to_sysblk(&nvmsb, info);
+
+ /* buffer for flash page */
+ bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes;
+ buf = kmalloc(bufsz, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
+
+ /* Write and verify */
+ for (i = 0; i < s->nr_rows; i++) {
+ struct ppa_addr ppa =
+ s->ppas[scan_ppa_idx(s, i, s->act_blk[i])];
+
+ pr_debug("nvm: writing sysblk to ch:%u lun:%u blk:%u pg:%u\n",
+ ppa.g.ch, ppa.g.lun,
+ ppa.g.blk, ppa.g.pg);
+
+ ret = nvm_submit_ppa(dev, ppa, NVM_OP_PWRITE, buf, bufsz);
+ if (ret) {
+ pr_err("nvm: sysblk failed program [ch%u lun%u blk%u]\n",
+ ppa.g.ch,
+ ppa.g.lun,
+ ppa.g.blk);
+ break;
+ }
+
+ ret = nvm_submit_ppa(dev, ppa, NVM_OP_PREAD, buf, bufsz);
+ if (ret) {
+ pr_err("nvm: sysblk failed read [ch%u lun%u blk%u]\n",
+ ppa.g.ch,
+ ppa.g.lun,
+ ppa.g.blk);
+ break;
+ }
+
+ if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) {
+ pr_err("nvm: sysblk failed verify [ch%u lun%u blk%u]\n",
+ ppa.g.ch,
+ ppa.g.lun,
+ ppa.g.blk);
+ break;
+ }
+ }
+
+ kfree(buf);
+
+ return ret;
+}
+
+static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s)
+{
+ int i, ret;
+ unsigned long nxt_blk;
+ struct ppa_addr *ppa;
+
+ for (i = 0; i < s->nr_rows; i++) {
+ nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK;
+ ppa = &s->ppas[scan_ppa_idx(s, i, nxt_blk)];
+ ppa->g.pg = 0;
+
+ ret = nvm_erase_ppa(dev, *ppa);
+ if (ret)
+ return ret;
+
+ s->act_blk[i] = nxt_blk;
+ }
+
+ return 0;
+}
+
+int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
+{
+ struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+ struct sysblk_scan s;
+ struct nvm_system_block *cur;
+ int i, j, found = 0;
+ int ret = -ENOMEM;
+
+ /*
+ * 1. setup sysblk locations
+ * 2. get bad block list
+ * 3. filter on host-specific (type 3)
+ * 4. iterate through all and find the highest seq nr.
+ * 5. return superblock information
+ */
+
+ if (!dev->ops->get_bb_tbl)
+ return -EINVAL;
+
+ nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+ mutex_lock(&dev->mlock);
+ ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas);
+ if (ret)
+ goto err_sysblk;
+
+ cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
+ if (!cur)
+ goto err_sysblk;
+
+ /* find the latest block across all sysblocks */
+ for (i = 0; i < s.nr_rows; i++) {
+ for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
+ struct ppa_addr ppa = s.ppas[scan_ppa_idx(&s, i, j)];
+
+ ret = nvm_scan_block(dev, &ppa, cur);
+ if (ret > 0)
+ found = 1;
+ else if (ret < 0)
+ break;
+ }
+ }
+
+ nvm_sysblk_to_cpu(info, cur);
+
+ kfree(cur);
+err_sysblk:
+ mutex_unlock(&dev->mlock);
+
+ if (found)
+ return 1;
+ return ret;
+}
+
+int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
+{
+ /* 1. for each latest superblock
+ * 2. if room
+ * a. write new flash page entry with the updated information
+ * 3. if no room
+ * a. find next available block on lun (linear search)
+ * if none, continue to next lun
+ * if none at all, report error. also report that it wasn't
+ * possible to write to all superblocks.
+ * c. write data to block.
+ */
+ struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+ struct sysblk_scan s;
+ struct nvm_system_block *cur;
+ int i, j, ppaidx, found = 0;
+ int ret = -ENOMEM;
+
+ if (!dev->ops->get_bb_tbl)
+ return -EINVAL;
+
+ nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+ mutex_lock(&dev->mlock);
+ ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas);
+ if (ret)
+ goto err_sysblk;
+
+ cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL);
+ if (!cur)
+ goto err_sysblk;
+
+ /* Get the latest sysblk for each sysblk row */
+ for (i = 0; i < s.nr_rows; i++) {
+ found = 0;
+ for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) {
+ ppaidx = scan_ppa_idx(&s, i, j);
+ ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur);
+ if (ret > 0) {
+ s.act_blk[i] = j;
+ found = 1;
+ } else if (ret < 0)
+ break;
+ }
+ }
+
+ if (!found) {
+ pr_err("nvm: no valid sysblks found to update\n");
+ ret = -EINVAL;
+ goto err_cur;
+ }
+
+ /*
+ * All sysblocks found. Check that they have same page id in their flash
+ * blocks
+ */
+ for (i = 1; i < s.nr_rows; i++) {
+ struct ppa_addr l = s.ppas[scan_ppa_idx(&s, 0, s.act_blk[0])];
+ struct ppa_addr r = s.ppas[scan_ppa_idx(&s, i, s.act_blk[i])];
+
+ if (l.g.pg != r.g.pg) {
+ pr_err("nvm: sysblks not on same page. Previous update failed.\n");
+ ret = -EINVAL;
+ goto err_cur;
+ }
+ }
+
+ /*
+ * Check that there haven't been another update to the seqnr since we
+ * began
+ */
+ if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) {
+ pr_err("nvm: seq is not sequential\n");
+ ret = -EINVAL;
+ goto err_cur;
+ }
+
+ /*
+ * When all pages in a block has been written, a new block is selected
+ * and writing is performed on the new block.
+ */
+ if (s.ppas[scan_ppa_idx(&s, 0, s.act_blk[0])].g.pg ==
+ dev->pgs_per_blk - 1) {
+ ret = nvm_prepare_new_sysblks(dev, &s);
+ if (ret)
+ goto err_cur;
+ }
+
+ ret = nvm_write_and_verify(dev, new, &s);
+err_cur:
+ kfree(cur);
+err_sysblk:
+ mutex_unlock(&dev->mlock);
+
+ return ret;
+}
+
+int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
+{
+ struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
+ struct sysblk_scan s;
+ int ret;
+
+ /*
+ * 1. select master blocks and select first available blks
+ * 2. get bad block list
+ * 3. mark MAX_SYSBLKS block as host-based device allocated.
+ * 4. write and verify data to block
+ */
+
+ if (!dev->ops->get_bb_tbl)
+ return -EINVAL;
+
+ /* Index all sysblocks and mark them as host-driven */
+ nvm_setup_sysblk_scan(dev, &s, sysblk_ppas);
+
+ mutex_lock(&dev->mlock);
+ ret = nvm_mark_all_sysblks(dev, &s, sysblk_ppas);
+ if (ret)
+ goto err_mark;
+
+ /* Write to the first block of each row */
+ ret = nvm_write_and_verify(dev, info, &s);
+err_mark:
+ mutex_unlock(&dev->mlock);
+ return ret;
+}
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 0017d55..083c4b9 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -283,6 +283,8 @@ struct nvm_dev {
/* Backend device */
struct request_queue *q;
char name[DISK_NAME_LEN];
+
+ struct mutex mlock;
};

static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
@@ -438,6 +440,33 @@ extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr);
extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr, int, void *, int);
+
+/* sysblk.c */
+#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */
+
+/* system block on disk representation */
+struct nvm_system_block {
+ __be32 magic; /* magic signature */
+ __be32 seqnr; /* sequence number */
+ __be32 erase_cnt; /* erase count */
+ __be16 version; /* version number */
+ __be16 mmtype; /* media manager type */
+ __be64 fs_ppa; /* PPA for media manager
+ * superblock */
+};
+
+/* system block cpu representation */
+struct nvm_sb_info {
+ unsigned long seqnr;
+ unsigned long erase_cnt;
+ unsigned int version;
+ unsigned int mmtype;
+ struct ppa_addr fs_ppa;
+};
+
+extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *);
+extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *);
+extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
#else /* CONFIG_NVM */
struct nvm_dev_ops;

--
2.1.4

2015-12-15 12:34:38

by Christoph Hellwig

[permalink] [raw]

Subject: Re: [PATCH RFC 3/5] lightnvm: add sync support for submit_io

On Mon, Dec 14, 2015 at 02:17:05PM +0100, Matias Bjørling wrote:
> Allow read and write I/Os to be issued synchronous. Users include the
> LightNVM core to implement system block support and similar.

I think the right way to implement this is to wait in the caller,
e.g. set your end_io handler to a trivial one that just does a complete
and add a completion that you can wait on in the caller.

2015-12-15 14:10:52

by Matias Bjørling

[permalink] [raw]

Subject: Re: [PATCH RFC 3/5] lightnvm: add sync support for submit_io

On 12/15/2015 01:34 PM, Christoph Hellwig wrote:
> On Mon, Dec 14, 2015 at 02:17:05PM +0100, Matias Bjørling wrote:
>> Allow read and write I/Os to be issued synchronous. Users include the
>> LightNVM core to implement system block support and similar.
>
> I think the right way to implement this is to wait in the caller,
> e.g. set your end_io handler to a trivial one that just does a complete
> and add a completion that you can wait on in the caller.
>

Thanks Christoph. I'll change it.