This patch set adds a block layout driver to the pNFS client. It passes
Connectathon tests and is bisectable. It requires an updated version of
nfs-utils, and patches for that have been sent separately to the nfs-utils
maintainer.
This patch set is also available on the for-trond branch of the git repo at
git://citi.umich.edu/projects/linux-pnfs-blk.git
This is version 2 of this patch set. Changes since version 1:
NFS41: Drop lseg ref before fallthru to MDS
SQUASHME: pnfsblock: get rid of vmap and deviceid->area structure
SQUASHME: pnfsblock: define module alias
SQUASHME: pnfsblock: bl_find_get_extent optimization: mv break clause to end of loop
SQUASHME: pnfsblock: test debug bit once for multiple dprintks
SQUASHME: pnfsblock: typo
SQUASHME: pnfsblock: get rid of unused leftovers from device mapping removal
Andy Adamson (2):
pnfs: GETDEVICELIST
pnfs: cleanup_layoutcommit
Benny Halevy (2):
pnfs: add set-clear layoutdriver interface
pnfsblock: use pageio_ops api
Fred (1):
pnfsblock: bl_find_get_extent
Fred Isaman (14):
pnfs: ask for layout_blksize and save it in nfs_server
pnfsblock: add blocklayout Kconfig option, Makefile, and stubs
pnfsblock: basic extent code
pnfsblock: lseg alloc and free
pnfsblock: merge extents
pnfsblock: call and parse getdevicelist
pnfsblock: xdr decode pnfs_block_layout4
pnfsblock: add extent manipulation functions
pnfsblock: merge rw extents
pnfsblock: encode_layoutcommit
pnfsblock: cleanup_layoutcommit
pnfsblock: bl_read_pagelist
pnfsblock: bl_write_pagelist
pnfsblock: note written INVAL areas for layoutcommit
Jim Rees (2):
pnfsblock: add device operations
pnfsblock: remove device operations
Peng Tao (4):
NFS41: Let layoutcommit handle multiple segments
NFS41: save layoutcommit cred after first successful layoutget
pnfsblock: write_pagelist handle zero invalid extents
NFS41: Drop lseg ref before fallthru to MDS
fs/nfs/Kconfig | 8 +-
fs/nfs/Makefile | 1 +
fs/nfs/blocklayout/Makefile | 5 +
fs/nfs/blocklayout/blocklayout.c | 1048 +++++++++++++++++++++++++++++++++++
fs/nfs/blocklayout/blocklayout.h | 208 +++++++
fs/nfs/blocklayout/blocklayoutdev.c | 410 ++++++++++++++
fs/nfs/blocklayout/blocklayoutdm.c | 111 ++++
fs/nfs/blocklayout/extents.c | 943 +++++++++++++++++++++++++++++++
fs/nfs/client.c | 11 +-
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4filelayout.c | 2 +-
fs/nfs/nfs4proc.c | 62 ++-
fs/nfs/nfs4xdr.c | 230 +++++++-
fs/nfs/pnfs.c | 86 ++-
fs/nfs/pnfs.h | 30 +-
include/linux/nfs.h | 2 +
include/linux/nfs4.h | 1 +
include/linux/nfs_fs_sb.h | 4 +-
include/linux/nfs_xdr.h | 17 +-
19 files changed, 3118 insertions(+), 63 deletions(-)
create mode 100644 fs/nfs/blocklayout/Makefile
create mode 100644 fs/nfs/blocklayout/blocklayout.c
create mode 100644 fs/nfs/blocklayout/blocklayout.h
create mode 100644 fs/nfs/blocklayout/blocklayoutdev.c
create mode 100644 fs/nfs/blocklayout/blocklayoutdm.c
create mode 100644 fs/nfs/blocklayout/extents.c
--
1.7.4.1
From: Fred Isaman <[email protected]>
In the blocklayout driver, two things happen during
layoutcommit/cleanup:
1. The modified extents are encoded.
2. On cleanup, the extents are put back on the layout's rw
extents list, for reads.
In the new system where actual xdr encoding is done in
encode_layoutcommit() directly into xdr buffer, these are
the new commit stages:
1. On setup_layoutcommit, the range is adjusted as before
and a structure is allocated for communication with
bl_encode_layoutcommit && bl_cleanup_layoutcommit
(Generic layer provides a void-star to hang it on)
2. bl_encode_layoutcommit is called to do the actual
encoding directly into xdr. The commit-extent-list is not
freed and is stored on above structure.
FIXME: The code is not yet converted to the new XDR cleanup
3. On cleanup the commit-extent-list is put back by a call
to set_to_rw() as before, but with no need for XDR decoding
of the list as before. And the commit-extent-list is freed.
Finally allocated structure is freed.
[pnfsblock: get rid of deprecated xdr macros]
Signed-off-by: Jim Rees <[email protected]>
[pnfsblock: SQUASHME: port block layout code]
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
[blocklayout: encode_layoutcommit implementation]
Signed-off-by: Boaz Harrosh <[email protected]>
[pnfsblock: fix bug setting up layoutcommit.]
Signed-off-by: Tao Guo <[email protected]>
[pnfsblock: prevent commit list corruption]
[pnfsblock: fix layoutcommit with an empty opaque]
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 2 +
fs/nfs/blocklayout/blocklayout.h | 12 +++
fs/nfs/blocklayout/extents.c | 176 ++++++++++++++++++++++++++++----------
3 files changed, 146 insertions(+), 44 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index c4b584b..36fd19c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -160,6 +160,8 @@ static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *arg)
{
+ dprintk("%s enter\n", __func__);
+ encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}
static void
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 37e5989..edbe6a9 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -91,6 +91,15 @@ struct pnfs_block_extent {
struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
};
+/* Shortened extent used by LAYOUTCOMMIT */
+struct pnfs_block_short_extent {
+ struct list_head bse_node;
+ struct nfs4_deviceid bse_devid; /* STUB - removable??? */
+ struct block_device *bse_mdev;
+ sector_t bse_f_offset; /* the starting offset in the file */
+ sector_t bse_length; /* the size of the extent */
+};
+
static inline void
INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
{
@@ -185,6 +194,9 @@ void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *alloc_extent(void);
struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index ed62cd8..462a270 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -286,6 +286,49 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
return -ENOMEM;
}
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ * The range is tagged EXTENT_WRITTEN in marks->im_tree while holding
+ * marks->im_lock; the return value is whatever _set_range() reports.
+ */
+int mark_written_sectors(struct pnfs_inval_markings *marks,
+ sector_t offset, sector_t length)
+{
+ int status;
+
+ dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+ (u64)offset, (u64)length);
+ spin_lock(&marks->im_lock);
+ status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+ spin_unlock(&marks->im_lock);
+ return status;
+}
+
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+ dprintk("PRINT SHORT EXTENT extent %p\n", be);
+ if (be) {
+ dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
+ dprintk(" be_length %llu\n", (u64)be->bse_length);
+ }
+}
+
+void print_clist(struct list_head *list, unsigned int count)
+{
+ struct pnfs_block_short_extent *be;
+ unsigned int i = 0;
+
+ ifdebug(FACILITY) {
+ printk(KERN_DEBUG "****************\n");
+ printk(KERN_DEBUG "Extent list looks like:\n");
+ list_for_each_entry(be, list, bse_node) {
+ i++;
+ print_short_extent(be);
+ }
+ if (i != count)
+ printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
+ printk(KERN_DEBUG "****************\n");
+ }
+}
+
static void print_bl_extent(struct pnfs_block_extent *be)
{
dprintk("PRINT EXTENT extent %p\n", be);
@@ -386,65 +429,67 @@ bl_add_merge_extent(struct pnfs_block_layout *bl,
/* Scan for proper place to insert, extending new to the left
* as much as possible.
*/
- list_for_each_entry_safe(be, tmp, list, be_node) {
- if (new->be_f_offset < be->be_f_offset)
+ list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
+ if (new->be_f_offset >= be->be_f_offset + be->be_length)
break;
- if (end <= be->be_f_offset + be->be_length) {
- /* new is a subset of existing be*/
+ if (new->be_f_offset >= be->be_f_offset) {
+ if (end <= be->be_f_offset + be->be_length) {
+ /* new is a subset of existing be*/
+ if (extents_consistent(be, new)) {
+ dprintk("%s: new is subset, ignoring\n",
+ __func__);
+ bl_put_extent(new);
+ return 0;
+ } else {
+ goto out_err;
+ }
+ } else {
+ /* |<-- be -->|
+ * |<-- new -->| */
+ if (extents_consistent(be, new)) {
+ /* extend new to fully replace be */
+ new->be_length += new->be_f_offset -
+ be->be_f_offset;
+ new->be_f_offset = be->be_f_offset;
+ new->be_v_offset = be->be_v_offset;
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else {
+ goto out_err;
+ }
+ }
+ } else if (end >= be->be_f_offset + be->be_length) {
+ /* new extent overlap existing be */
if (extents_consistent(be, new)) {
- dprintk("%s: new is subset, ignoring\n",
- __func__);
- bl_put_extent(new);
- return 0;
- } else
+ /* extend new to fully replace be */
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else {
goto out_err;
- } else if (new->be_f_offset <=
- be->be_f_offset + be->be_length) {
- /* new overlaps or abuts existing be */
- if (extents_consistent(be, new)) {
+ }
+ } else if (end > be->be_f_offset) {
+ /* |<-- be -->|
+ *|<-- new -->| */
+ if (extents_consistent(new, be)) {
/* extend new to fully replace be */
- new->be_length += new->be_f_offset -
- be->be_f_offset;
- new->be_f_offset = be->be_f_offset;
- new->be_v_offset = be->be_v_offset;
+ new->be_length += be->be_f_offset + be->be_length -
+ new->be_f_offset - new->be_length;
dprintk("%s: removing %p\n", __func__, be);
list_del(&be->be_node);
bl_put_extent(be);
- } else if (new->be_f_offset !=
- be->be_f_offset + be->be_length)
+ } else {
goto out_err;
+ }
}
}
/* Note that if we never hit the above break, be will not point to a
* valid extent. However, in that case &be->be_node==list.
*/
- list_add_tail(&new->be_node, &be->be_node);
+ list_add(&new->be_node, &be->be_node);
dprintk("%s: inserting new\n", __func__);
print_elist(list);
- /* Scan forward for overlaps. If we find any, extend new and
- * remove the overlapped extent.
- */
- be = list_prepare_entry(new, list, be_node);
- list_for_each_entry_safe_continue(be, tmp, list, be_node) {
- if (end < be->be_f_offset)
- break;
- /* new overlaps or abuts existing be */
- if (extents_consistent(be, new)) {
- if (end < be->be_f_offset + be->be_length) {
- /* extend new to fully cover be */
- end = be->be_f_offset + be->be_length;
- new->be_length = end - new->be_f_offset;
- }
- dprintk("%s: removing %p\n", __func__, be);
- list_del(&be->be_node);
- bl_put_extent(be);
- } else if (end != be->be_f_offset) {
- list_del(&new->be_node);
- goto out_err;
- }
- }
- dprintk("%s: after merging\n", __func__);
- print_elist(list);
/* STUB - The per-list consistency checks have all been done,
* should now check cross-list consistency.
*/
@@ -502,6 +547,49 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
return ret;
}
+int
+encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg)
+{
+ struct pnfs_block_short_extent *lce, *save;
+ unsigned int count = 0;
+ __be32 *p, *xdr_start;
+
+ dprintk("%s enter\n", __func__);
+ /* BUG - creation of bl_commit is buggy - need to wait for
+ * entire block to be marked WRITTEN before it can be added.
+ */
+ spin_lock(&bl->bl_ext_lock);
+ /* Want to adjust for possible truncate */
+ /* We now want to adjust argument range */
+
+ /* XDR encode the ranges found */
+ xdr_start = xdr_reserve_space(xdr, 8);
+ if (!xdr_start)
+ goto out;
+ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
+ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
+ if (!p)
+ break;
+ p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ list_del(&lce->bse_node);
+ list_add_tail(&lce->bse_node, &bl->bl_committing);
+ bl->bl_count--;
+ count++;
+ }
+ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
+ xdr_start[1] = cpu_to_be32(count);
+out:
+ spin_unlock(&bl->bl_ext_lock);
+ dprintk("%s found %i ranges\n", __func__, count);
+ return 0;
+}
+
/* Helper function to set_to_rw that initialize a new extent */
static void
_prep_new_extent(struct pnfs_block_extent *new,
--
1.7.4.1
From: Benny Halevy <[email protected]>
Allow the layout driver to issue GETDEVICELIST at mount time and to
clean up at umount time.
[fixup non NFS_V4_1 set_pnfs_layoutdriver definition]
[pnfs: pass mntfh down the init_pnfs path]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/client.c | 8 +++++---
fs/nfs/pnfs.c | 16 ++++++++++++++--
fs/nfs/pnfs.h | 8 ++++++--
3 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 19ea7d9..a9b1848 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -904,7 +904,9 @@ error:
/*
* Load up the server record from information gained in an fsinfo record
*/
-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+static void nfs_server_set_fsinfo(struct nfs_server *server,
+ struct nfs_fh *mntfh,
+ struct nfs_fsinfo *fsinfo)
{
unsigned long max_rpc_payload;
@@ -934,7 +936,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- set_pnfs_layoutdriver(server, fsinfo->layouttype);
+ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
@@ -980,7 +982,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
if (error < 0)
goto out_error;
- nfs_server_set_fsinfo(server, &fsinfo);
+ nfs_server_set_fsinfo(server, mntfh, &fsinfo);
/* Get some general file system info */
if (server->namelen == 0) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 38e5508..8e72724 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -76,8 +76,11 @@ find_pnfs_driver(u32 id)
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
- if (nfss->pnfs_curr_ld)
+ if (nfss->pnfs_curr_ld) {
+ if (nfss->pnfs_curr_ld->clear_layoutdriver)
+ nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
module_put(nfss->pnfs_curr_ld->owner);
+ }
nfss->pnfs_curr_ld = NULL;
}
@@ -88,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
* @id layout type. Zero (illegal layout type) indicates pNFS not in use.
*/
void
-set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
+set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
+ u32 id)
{
struct pnfs_layoutdriver_type *ld_type = NULL;
@@ -115,6 +119,14 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
goto out_no_driver;
}
server->pnfs_curr_ld = ld_type;
+ if (ld_type->set_layoutdriver
+ && ld_type->set_layoutdriver(server, mntfh)) {
+ printk(KERN_ERR
+ "%s: Error initializing mount point for layout driver %u.\n",
+ __func__, id);
+ module_put(ld_type->owner);
+ goto out_no_driver;
+ }
dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ffea314..23d8267 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type {
struct module *owner;
unsigned flags;
+ int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
+ int (*clear_layoutdriver) (struct nfs_server *);
+
struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
void (*free_layout_hdr) (struct pnfs_layout_hdr *);
@@ -165,7 +168,7 @@ void put_lseg(struct pnfs_layout_segment *lseg);
bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int);
-void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -372,7 +375,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier)
return false;
}
-static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id)
+/* Empty stub with the same signature as the real set_pnfs_layoutdriver();
+ * per the changelog this is the non-NFS_V4_1 definition.
+ */
+static inline void set_pnfs_layoutdriver(struct nfs_server *s,
+ const struct nfs_fh *mntfh, u32 id)
{
}
--
1.7.4.1
From: Fred Isaman <[email protected]>
Call GETDEVICELIST during mount, then call and parse GETDEVICEINFO
for each device returned.
[pnfsblock: get rid of deprecated xdr macros]
Signed-off-by: Jim Rees <[email protected]>
[pnfsblock: fix pnfs_deviceid references]
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: fix print format warnings for sector_t and size_t]
[pnfs-block: #include <linux/vmalloc.h>]
[pnfsblock: no PNFS_NFS_SERVER]
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: fix bug determining size of striped volume]
[pnfsblock: fix oops when using multiple devices]
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: get rid of vmap and deviceid->area structure]
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 138 ++++++++++++++++++++++++++++++++++-
fs/nfs/blocklayout/blocklayout.h | 13 +++-
fs/nfs/blocklayout/blocklayoutdev.c | 13 +++-
fs/nfs/pnfs.h | 1 -
4 files changed, 157 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1858743..3bf60e3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -163,17 +163,153 @@ bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
{
}
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+ if (mid) {
+ struct pnfs_block_dev *dev;
+ spin_lock(&mid->bm_lock);
+ while (!list_empty(&mid->bm_devlist)) {
+ dev = list_first_entry(&mid->bm_devlist,
+ struct pnfs_block_dev,
+ bm_node);
+ list_del(&dev->bm_node);
+ free_block_dev(dev);
+ }
+ spin_unlock(&mid->bm_lock);
+ kfree(mid);
+ }
+}
+
+/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
+ *
+ * Issues GETDEVICEINFO for d_id into a freshly allocated page array sized
+ * from the session's max_resp_sz, then hands the reply to
+ * nfs4_blk_decode_device(). Returns the decoded device, or NULL on any
+ * allocation, RPC or decode failure. The temporary pages and the
+ * pnfs_device descriptor are always released before returning.
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+ struct nfs4_deviceid *d_id)
+{
+ struct pnfs_device *dev;
+ struct pnfs_block_dev *rv = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ struct page **pages = NULL;
+ int i, rc;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ max_pages = max_resp_sz >> PAGE_SHIFT;
+ dprintk("%s max_resp_sz %u max_pages %d\n",
+ __func__, max_resp_sz, max_pages);
+
+ dev = kmalloc(sizeof(*dev), GFP_NOFS);
+ if (!dev) {
+ dprintk("%s kmalloc failed\n", __func__);
+ return NULL;
+ }
+
+ pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
+ if (pages == NULL) {
+ kfree(dev);
+ return NULL;
+ }
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i])
+ goto out_free;
+ }
+
+ memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+ dev->layout_type = LAYOUT_BLOCK_VOLUME;
+ dev->pages = pages;
+ dev->pgbase = 0;
+ dev->pglen = PAGE_SIZE * max_pages;
+ dev->mincount = 0;
+
+ dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+ rc = nfs4_proc_getdeviceinfo(server, dev);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free;
+
+ rv = nfs4_blk_decode_device(server, dev);
+ out_free:
+ /* pages[] was zero-filled and populated in order, so the first NULL
+ * entry marks where allocation stopped; never hand it to __free_page().
+ */
+ for (i = 0; i < max_pages && pages[i]; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+ kfree(dev);
+ return rv;
+}
+
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
+ struct block_mount_id *b_mt_id = NULL;
+ struct pnfs_devicelist *dlist = NULL;
+ struct pnfs_block_dev *bdev;
+ LIST_HEAD(block_disklist);
+ int status = 0, i;
+
dprintk("%s enter\n", __func__);
- return 0;
+
+ if (server->pnfs_blksize == 0) {
+ dprintk("%s Server did not return blksize\n", __func__);
+ return -EINVAL;
+ }
+ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
+ if (!b_mt_id) {
+ status = -ENOMEM;
+ goto out_error;
+ }
+ /* Initialize nfs4 block layout mount id */
+ spin_lock_init(&b_mt_id->bm_lock);
+ INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+
+ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
+ if (!dlist) {
+ status = -ENOMEM;
+ goto out_error;
+ }
+ dlist->eof = 0;
+ while (!dlist->eof) {
+ status = nfs4_proc_getdevicelist(server, fh, dlist);
+ if (status)
+ goto out_error;
+ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+ __func__, dlist->num_devs, dlist->eof);
+ for (i = 0; i < dlist->num_devs; i++) {
+ bdev = nfs4_blk_get_deviceinfo(server, fh,
+ &dlist->dev_id[i]);
+ if (!bdev) {
+ status = -ENODEV;
+ goto out_error;
+ }
+ spin_lock(&b_mt_id->bm_lock);
+ list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+ spin_unlock(&b_mt_id->bm_lock);
+ }
+ }
+ dprintk("%s SUCCESS\n", __func__);
+ server->pnfs_ld_data = b_mt_id;
+
+ out_return:
+ kfree(dlist);
+ return status;
+
+ out_error:
+ free_blk_mountid(b_mt_id);
+ goto out_return;
}
static int
bl_clear_layoutdriver(struct nfs_server *server)
{
+ struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+
dprintk("%s enter\n", __func__);
+ free_blk_mountid(b_mt_id);
+ dprintk("%s RETURNS\n", __func__);
return 0;
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index ff140e7..ed6253e 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -38,6 +38,11 @@
#include "../pnfs.h"
+struct block_mount_id {
+ spinlock_t bm_lock; /* protects list */
+ struct list_head bm_devlist; /* holds pnfs_block_dev */
+};
+
struct pnfs_block_dev {
struct list_head bm_node;
struct nfs4_deviceid bm_mdevid; /* associated devid */
@@ -99,7 +104,10 @@ struct pnfs_block_layout {
sector_t bl_blocksize; /* Server blocksize in sectors */
};
-static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
return container_of(lo, struct pnfs_block_layout, bl_layout);
}
@@ -137,8 +145,7 @@ void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
struct block_device *nfs4_blkdev_get(dev_t dev);
int nfs4_blkdev_put(struct block_device *bdev);
struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev,
- struct list_head *sdlist);
+ struct pnfs_device *dev);
int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 64da33a..b23fe60 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -116,8 +116,7 @@ void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
*/
struct pnfs_block_dev *
nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev,
- struct list_head *sdlist)
+ struct pnfs_device *dev)
{
struct pnfs_block_dev *rv = NULL;
struct block_device *bd = NULL;
@@ -129,6 +128,7 @@ nfs4_blk_decode_device(struct nfs_server *server,
uint8_t *dataptr;
DECLARE_WAITQUEUE(wq, current);
struct bl_dev_msg *reply = &bl_mount_reply;
+ int offset, len, i;
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
@@ -143,7 +143,14 @@ nfs4_blk_decode_device(struct nfs_server *server,
memcpy(msg.data, &bl_msg, sizeof(bl_msg));
dataptr = (uint8_t *) msg.data;
- memcpy(&dataptr[sizeof(bl_msg)], dev->area, dev->mincount);
+ len = dev->mincount;
+ offset = sizeof(bl_msg);
+ for (i = 0; len > 0; i++) {
+ memcpy(&dataptr[offset], page_address(dev->pages[i]),
+ len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+ len -= PAGE_CACHE_SIZE;
+ offset += PAGE_CACHE_SIZE;
+ }
msg.len = sizeof(bl_msg) + dev->mincount;
dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f271425..82dde37 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -141,7 +141,6 @@ struct pnfs_device {
unsigned int layout_type;
unsigned int mincount;
struct page **pages;
- void *area;
unsigned int pgbase;
unsigned int pglen;
};
--
1.7.4.1
From: Fred Isaman <[email protected]>
XDR decodes the block layout payload sent in LAYOUTGET result, storing
the result in an extent list.
[pnfsblock: get rid of deprecated xdr macros]
Signed-off-by: Jim Rees <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: fix bug getting pnfs_layout_type in translate_devid().]
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayoutdev.c | 208 ++++++++++++++++++++++++++++++++++-
1 files changed, 206 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index b23fe60..3bf8358 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -40,6 +40,19 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+ uint64_t s;
+
+ *rp = xdr_decode_hyper(*rp, &s);
+ if (s & 0x1ff) {
+ printk(KERN_WARNING "%s: sector not aligned\n", __func__);
+ return -1;
+ }
+ *sp = s >> SECTOR_SHIFT;
+ return 0;
+}
+
/* Open a block_device by device number. */
struct block_device *nfs4_blkdev_get(dev_t dev)
{
@@ -197,10 +210,201 @@ out:
return rv;
}
+/* Map deviceid returned by the server to constructed block_device.
+ * Walks the mount's bm_devlist under bm_lock and returns the bm_mdev of
+ * the first entry whose device id bytes match, or NULL if none does.
+ */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+ struct nfs4_deviceid *id)
+{
+ struct block_device *rv = NULL;
+ struct block_mount_id *mid;
+ struct pnfs_block_dev *dev;
+
+ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+ mid = BLK_ID(lo);
+ spin_lock(&mid->bm_lock);
+ list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
+ if (memcmp(id->data, dev->bm_mdevid.data,
+ NFS4_DEVICEID4_SIZE) == 0) {
+ rv = dev->bm_mdev;
+ goto out;
+ }
+ }
+ out:
+ spin_unlock(&mid->bm_lock);
+ dprintk("%s returning %p\n", __func__, rv);
+ return rv;
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+ struct layout_verification *lv)
+{
+ if (lv->mode == IOMODE_READ) {
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA)
+ return -EIO;
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ }
+ /* lv->mode == IOMODE_RW */
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ if (lv->cowread > lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ lv->inval = lv->start;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ if (be->be_f_offset != lv->start)
+ return -EIO;
+ lv->start += be->be_length;
+ return 0;
+ } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+ if (be->be_f_offset > lv->start)
+ return -EIO;
+ if (be->be_f_offset < lv->inval)
+ return -EIO;
+ if (be->be_f_offset < lv->cowread)
+ return -EIO;
+ /* It looks like you might want to min this with lv->start,
+ * but you really don't.
+ */
+ lv->inval = lv->inval + be->be_length;
+ lv->cowread = be->be_f_offset + be->be_length;
+ return 0;
+ } else
+ return -EIO;
+}
+
+/* XDR decode the pnfs_block_layout4 payload of a LAYOUTGET reply.
+ * Extents are decoded and verified into a temporary list, then merged
+ * into the layout under bl_ext_lock. Returns 0 on success, -ENOMEM on
+ * allocation failure, otherwise -EIO or the bl_add_merge_extent() error.
+ */
int
nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
{
- /* STUB */
- return -EIO;
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ int i, status = -EIO;
+ uint32_t count;
+ struct pnfs_block_extent *be = NULL, *save;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ __be32 *p;
+ struct layout_verification lv = {
+ .mode = lgr->range.iomode,
+ .start = lgr->range.offset >> SECTOR_SHIFT,
+ .inval = lgr->range.offset >> SECTOR_SHIFT,
+ .cowread = lgr->range.offset >> SECTOR_SHIFT,
+ };
+ LIST_HEAD(extents);
+
+ dprintk("---> %s\n", __func__);
+
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return -ENOMEM;
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err;
+
+ count = be32_to_cpup(p++);
+
+ dprintk("%s enter, number of extents %i\n", __func__, count);
+ /* The extent count comes off the wire: make sure the payload can
+ * actually hold that many extents, so the size computed below cannot
+ * wrap around 32 bits and pass the bounds check with a short buffer.
+ */
+ if (count > lgr->layoutp->len / (28 + NFS4_DEVICEID4_SIZE))
+ goto out_err;
+ p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
+ if (unlikely(!p))
+ goto out_err;
+
+ /* Decode individual extents, putting them in temporary
+ * staging area until whole layout is decoded to make error
+ * recovery easier.
+ */
+ for (i = 0; i < count; i++) {
+ be = alloc_extent();
+ if (!be) {
+ status = -ENOMEM;
+ goto out_err;
+ }
+ memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+ be->be_mdev = translate_devid(lo, &be->be_devid);
+ if (!be->be_mdev)
+ goto out_err;
+
+ /* The next three values are read in as bytes,
+ * but stored as 512-byte sector lengths
+ */
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_err;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_err;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_err;
+ be->be_state = be32_to_cpup(p++);
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+ be->be_inval = &bl->bl_inval;
+ if (verify_extent(be, &lv)) {
+ dprintk("%s verify failed\n", __func__);
+ goto out_err;
+ }
+ list_add_tail(&be->be_node, &extents);
+ }
+ if (lgr->range.offset + lgr->range.length !=
+ lv.start << SECTOR_SHIFT) {
+ dprintk("%s Final length mismatch\n", __func__);
+ be = NULL;
+ goto out_err;
+ }
+ if (lv.start < lv.cowread) {
+ dprintk("%s Final uncovered COW extent\n", __func__);
+ be = NULL;
+ goto out_err;
+ }
+ /* Extents decoded properly, now try to merge them in to
+ * existing layout extents.
+ */
+ spin_lock(&bl->bl_ext_lock);
+ list_for_each_entry_safe(be, save, &extents, be_node) {
+ list_del(&be->be_node);
+ status = bl_add_merge_extent(bl, be);
+ if (status) {
+ spin_unlock(&bl->bl_ext_lock);
+ /* This is a fairly catastrophic error, as the
+ * entire layout extent lists are now corrupted.
+ * We should have some way to distinguish this.
+ */
+ be = NULL;
+ goto out_err;
+ }
+ }
+ spin_unlock(&bl->bl_ext_lock);
+ status = 0;
+ out:
+ __free_page(scratch);
+ dprintk("%s returns %i\n", __func__, status);
+ return status;
+
+ out_err:
+ bl_put_extent(be);
+ while (!list_empty(&extents)) {
+ be = list_first_entry(&extents, struct pnfs_block_extent,
+ be_node);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ }
+ goto out;
}
--
1.7.4.1
From: Fred Isaman <[email protected]>
Block layout needs it to determine IO size.
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/client.c | 1 +
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 5 +-
fs/nfs/nfs4xdr.c | 99 +++++++++++++++++++++++++++++++++++++--------
include/linux/nfs_fs_sb.h | 4 +-
include/linux/nfs_xdr.h | 3 +-
6 files changed, 92 insertions(+), 22 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a9b1848..de00a37 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -936,6 +936,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->pnfs_blksize = fsinfo->blksize;
set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c30aed2..b7ad2f0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
extern const u32 nfs4_fattr_bitmap[2];
extern const u32 nfs4_statfs_bitmap[2];
extern const u32 nfs4_pathconf_bitmap[2];
-extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[3];
extern const u32 nfs4_fs_locations_bitmap[2];
/* nfs4renewd.c */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 784c1a2..e02f545 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
0
};
-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
+const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
| FATTR4_WORD0_MAXREAD
| FATTR4_WORD0_MAXWRITE
| FATTR4_WORD0_LEASE_TIME,
FATTR4_WORD1_TIME_DELTA
- | FATTR4_WORD1_FS_LAYOUT_TYPES
+ | FATTR4_WORD1_FS_LAYOUT_TYPES,
+ FATTR4_WORD2_LAYOUT_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[2] = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a82dd40..5ce3c64 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
#define encode_fsinfo_maxsz (encode_getattr_maxsz)
-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15)
+/* The 5 accounts for the PNFS attributes, and assumes that at most three
+ * layout types will be returned.
+ */
+#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
+ nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
#define decode_renew_maxsz (op_decode_hdr_maxsz)
#define encode_setclientid_maxsz \
@@ -1123,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
hdr->replen += decode_getattr_maxsz;
}
+/*
+ * Encode a GETATTR op carrying an attribute bitmap of up to three words.
+ * Trailing zero words are dropped: three words are sent only when bm2 is
+ * non-zero, two when only bm1 is non-zero, otherwise a single word.
+ */
+static void
+encode_getattr_three(struct xdr_stream *xdr,
+		     uint32_t bm0, uint32_t bm1, uint32_t bm2,
+		     struct compound_hdr *hdr)
+{
+	const uint32_t words[3] = { bm0, bm1, bm2 };
+	uint32_t nwords = bm2 ? 3 : (bm1 ? 2 : 1);
+	uint32_t w;
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_GETATTR);
+
+	/* one length word followed by nwords bitmap words */
+	p = reserve_space(xdr, 4 * (nwords + 1));
+	*p++ = cpu_to_be32(nwords);
+	for (w = 0; w < nwords; w++)
+		*p++ = cpu_to_be32(words[w]);
+
+	hdr->nops++;
+	hdr->replen += decode_getattr_maxsz;
+}
+
static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
@@ -1131,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
+ encode_getattr_three(xdr,
+ bitmask[0] & nfs4_fsinfo_bitmap[0],
+ bitmask[1] & nfs4_fsinfo_bitmap[1],
+ bitmask[2] & nfs4_fsinfo_bitmap[2],
+ hdr);
}
static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
@@ -2643,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
struct compound_hdr hdr = {
.nops = 0,
};
- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
encode_compound_hdr(xdr, req, &hdr);
encode_setclientid_confirm(xdr, arg, &hdr);
@@ -2787,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
};
- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
+ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->la_seq_args, &hdr);
@@ -3068,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
goto out_overflow;
bmlen = be32_to_cpup(p);
- bitmap[0] = bitmap[1] = 0;
+ bitmap[0] = bitmap[1] = bitmap[2] = 0;
p = xdr_inline_decode(xdr, (bmlen << 2));
if (unlikely(!p))
goto out_overflow;
if (bmlen > 0) {
bitmap[0] = be32_to_cpup(p++);
- if (bmlen > 1)
- bitmap[1] = be32_to_cpup(p);
+ if (bmlen > 1) {
+ bitmap[1] = be32_to_cpup(p++);
+ if (bmlen > 2)
+ bitmap[2] = be32_to_cpup(p);
+ }
}
return 0;
out_overflow:
@@ -3107,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
return ret;
bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
} else
- bitmask[0] = bitmask[1] = 0;
- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
return 0;
}
@@ -4162,7 +4202,7 @@ out_overflow:
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4188,7 +4228,7 @@ xdr_error:
static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4220,7 +4260,7 @@ xdr_error:
static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2] = {0};
+ uint32_t attrlen, bitmap[3] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4360,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
{
__be32 *savep;
uint32_t attrlen,
- bitmap[2] = {0};
+ bitmap[3] = {0};
int status;
status = decode_op_hdr(xdr, OP_GETATTR);
@@ -4446,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
return status;
}
+/*
+ * Decode the preferred block size for layout-directed I/O.
+ *
+ * *res is set to 0 when the attribute is not present in bitmap[2];
+ * otherwise it receives the decoded value and the attribute bit is
+ * cleared from the bitmap.  Returns 0, or -EIO if the stream is short.
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+				      uint32_t *res)
+{
+	__be32 *p;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+	*res = 0;
+	if (!(bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE))
+		return 0;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p)) {
+		print_overflow_msg(__func__, xdr);
+		return -EIO;
+	}
+	*res = be32_to_cpup(p);
+	bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+	return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
__be32 *savep;
- uint32_t attrlen, bitmap[2];
+ uint32_t attrlen, bitmap[3];
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -4477,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -4896,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
{
__be32 *savep;
uint32_t attrlen,
- bitmap[2] = {0};
+ bitmap[3] = {0};
struct kvec *iov = req->rq_rcv_buf.head;
int status;
@@ -6849,7 +6914,7 @@ out:
int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
int plus)
{
- uint32_t bitmap[2] = {0};
+ uint32_t bitmap[3] = {0};
uint32_t len;
__be32 *p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4faeac8..6e6ab4a 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -132,7 +132,7 @@ struct nfs_server {
#endif
#ifdef CONFIG_NFS_V4
- u32 attr_bitmask[2];/* V4 bitmask representing the set
+ u32 attr_bitmask[3];/* V4 bitmask representing the set
of attributes supported on this
filesystem */
u32 cache_consistency_bitmask[2];
@@ -145,6 +145,8 @@ struct nfs_server {
filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
struct rpc_wait_queue roc_rpcwaitq;
+ void *pnfs_ld_data; /* per mount point data */
+ u32 pnfs_blksize; /* layout_blksize attr */
/* the following fields are protected by nfs_client->cl_lock */
struct rb_root state_owners;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 21f333e..94f27e5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -122,6 +122,7 @@ struct nfs_fsinfo {
struct timespec time_delta; /* server time granularity */
__u32 lease_time; /* in seconds */
__u32 layouttype; /* supported pnfs layout driver */
+ __u32 blksize; /* preferred pnfs io block size */
};
struct nfs_fsstat {
@@ -954,7 +955,7 @@ struct nfs4_server_caps_arg {
};
struct nfs4_server_caps_res {
- u32 attr_bitmask[2];
+ u32 attr_bitmask[3];
u32 acl_bitmask;
u32 has_links;
u32 has_symlinks;
--
1.7.4.1
From: Fred Isaman <[email protected]>
In the blocklayout driver, two things happen during
layoutcommit/cleanup:
1. the modified extents are encoded.
2. On cleanup the extents are put back on the layout rw
extents list, for reads.
In the new system where actual xdr encoding is done in
encode_layoutcommit() directly into xdr buffer, these are
the new commit stages:
1. On setup_layoutcommit, the range is adjusted as before
and a structure is allocated for communication with
bl_encode_layoutcommit && bl_cleanup_layoutcommit
(Generic layer provides a void-star to hang it on)
2. bl_encode_layoutcommit is called to do the actual
encoding directly into xdr. The commit-extent-list is not
freed and is stored on above structure.
FIXME: The code is not yet converted to the new XDR cleanup
3. On cleanup the commit-extent-list is put back by a call
to set_to_rw() as before, but with no need for XDR decoding
of the list as before. And the commit-extent-list is freed.
Finally allocated structure is freed.
[SQUASHME: pnfs: blocklayout: port block layout code]
Signed-off-by: Peng Tao <[email protected]>
[pnfsblock: SQUASHME: adjust to API change]
Signed-off-by: Fred Isaman <[email protected]>
[blocklayout: encode_layoutcommit implementation]
Signed-off-by: Boaz Harrosh <[email protected]>
[pnfsblock: fix bug setting up layoutcommit.]
Signed-off-by: Tao Guo <[email protected]>
[pnfsblock: cleanup_layoutcommit wants a status parameter]
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 2 +
fs/nfs/blocklayout/blocklayout.h | 3 +
fs/nfs/blocklayout/extents.c | 210 ++++++++++++++++++++++++++++++++++++++
3 files changed, 215 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 36fd19c..300a678 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -168,6 +168,8 @@ static void
bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
struct nfs4_layoutcommit_data *lcdata)
{
+ dprintk("%s enter\n", __func__);
+ clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
}
static void free_blk_mountid(struct block_mount_id *mid)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index edbe6a9..197f919 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -197,6 +197,9 @@ int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+ const struct nfs4_layoutcommit_args *arg,
+ int status);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 462a270..483c235 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -329,6 +329,73 @@ void print_clist(struct list_head *list, unsigned int count)
}
}
+/* Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to bl_add_merge_extent
+ *
+ * Insert @new into bl->bl_commit, which is walked in ascending
+ * bse_f_offset order, merging it with any overlapping or abutting
+ * entries that share the same bse_mdev.  bl->bl_count is adjusted for
+ * every entry added or removed.
+ *
+ * Ownership of @new passes to this function: it is either linked onto
+ * the list or kfree()d when its range is already covered.
+ *
+ * NOTE(review): the caller visible in this patch holds bl->bl_ext_lock
+ * around the call; presumably every caller must - confirm.
+ */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+ struct pnfs_block_short_extent *new)
+{
+ struct list_head *clist = &bl->bl_commit;
+ struct pnfs_block_short_extent *old, *save;
+ sector_t end = new->bse_f_offset + new->bse_length; /* first sector past new */
+
+ dprintk("%s enter\n", __func__);
+ print_short_extent(new);
+ print_clist(clist, bl->bl_count);
+ bl->bl_count++; /* counted up front; undone below if new is dropped */
+ /* Scan for proper place to insert, extending new to the left
+ * as much as possible.
+ */
+ list_for_each_entry_safe(old, save, clist, bse_node) {
+ if (new->bse_f_offset < old->bse_f_offset)
+ break;
+ if (end <= old->bse_f_offset + old->bse_length) {
+ /* Range is already in list */
+ bl->bl_count--;
+ kfree(new);
+ return;
+ } else if (new->bse_f_offset <=
+ old->bse_f_offset + old->bse_length) {
+ /* new overlaps or abuts existing be */
+ if (new->bse_mdev == old->bse_mdev) {
+ /* extend new to fully replace old; end is unchanged
+ * because only the start of new moves left
+ */
+ new->bse_length += new->bse_f_offset -
+ old->bse_f_offset;
+ new->bse_f_offset = old->bse_f_offset;
+ list_del(&old->bse_node);
+ bl->bl_count--;
+ kfree(old);
+ }
+ }
+ }
+ /* Note that if we never hit the above break, old will not point to a
+ * valid extent. However, in that case &old->bse_node==list.
+ * Either way list_add_tail() places new immediately before old.
+ */
+ list_add_tail(&new->bse_node, &old->bse_node);
+ /* Scan forward for overlaps. If we find any, extend new and
+ * remove the overlapped extent.
+ */
+ old = list_prepare_entry(new, clist, bse_node);
+ list_for_each_entry_safe_continue(old, save, clist, bse_node) {
+ if (end < old->bse_f_offset)
+ break;
+ /* new overlaps or abuts old */
+ if (new->bse_mdev == old->bse_mdev) {
+ if (end < old->bse_f_offset + old->bse_length) {
+ /* extend new to fully cover old */
+ end = old->bse_f_offset + old->bse_length;
+ new->bse_length = end - new->bse_f_offset;
+ }
+ list_del(&old->bse_node);
+ bl->bl_count--;
+ kfree(old);
+ }
+ }
+ dprintk("%s: after merging\n", __func__);
+ print_clist(clist, bl->bl_count);
+}
+
static void print_bl_extent(struct pnfs_block_extent *be)
{
dprintk("PRINT EXTENT extent %p\n", be);
@@ -547,6 +614,34 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
return ret;
}
+/* Lock-held counterpart of bl_find_get_extent that ignores cow.
+ * Searches each extent list in turn, from the tail, for the extent
+ * containing isect.  Returns that extent with an extra reference
+ * taken, or NULL when no list contains isect.
+ */
+static struct pnfs_block_extent *
+bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
+{
+	struct pnfs_block_extent *found = NULL;
+	struct pnfs_block_extent *cur;
+	int idx;
+
+	dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+	for (idx = 0; idx < EXTENT_LISTS && !found; idx++) {
+		list_for_each_entry_reverse(cur, &bl->bl_extents[idx], be_node) {
+			/* isect lies beyond this extent: stop scanning this list */
+			if (isect >= cur->be_f_offset + cur->be_length)
+				break;
+			/* isect lies before this extent: keep walking backwards */
+			if (isect < cur->be_f_offset)
+				continue;
+			/* We have found an extent */
+			dprintk("%s Get %p (%i)\n", __func__, cur,
+				atomic_read(&cur->be_refcnt.refcount));
+			kref_get(&cur->be_refcnt);
+			found = cur;
+			break;
+		}
+	}
+	print_bl_extent(found);
+	return found;
+}
+
int
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
struct xdr_stream *xdr,
@@ -636,3 +731,118 @@ _front_merge(struct pnfs_block_extent *be, struct list_head *head,
kfree(storage);
return be;
}
+
+/* Convert the INVALID extent containing @offset to READWRITE over the
+ * part of [offset, offset + length) that it covers.
+ *
+ * The covering extent is replaced by up to three pieces: an INVALID head
+ * (e1), the READWRITE middle (e2) and an INVALID tail (e3).  A piece that
+ * is not needed is passed to _front_merge() instead of being inserted.
+ *
+ * Returns the sector at which the caller should continue: the end of the
+ * extent that was found, or offset + length when allocation fails or no
+ * extent covers @offset (so the caller's loop terminates).
+ */
+static u64
+set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
+{
+	u64 rv = offset + length;
+	struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
+	struct pnfs_block_extent *children[3];
+	struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
+	int i = 0, j;
+
+	dprintk("%s(%llu, %llu)\n", __func__, offset, length);
+	/* Create storage for up to three new extents e1, e2, e3 */
+	e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
+	e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
+	e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
+	/* BUG - we are ignoring any failure */
+	if (!e1 || !e2 || !e3)
+		goto out_nosplit;
+
+	spin_lock(&bl->bl_ext_lock);
+	be = bl_find_get_extent_locked(bl, offset);
+	if (!be) {
+		/* No extent covers offset: nothing to convert.  rv is still
+		 * offset + length, which ends the caller's loop.
+		 */
+		spin_unlock(&bl->bl_ext_lock);
+		goto out_nosplit;
+	}
+	rv = be->be_f_offset + be->be_length;
+	if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
+		spin_unlock(&bl->bl_ext_lock);
+		/* Drop the reference taken by the lookup above */
+		bl_put_extent(be);
+		goto out_nosplit;
+	}
+	/* Add e* to children, bumping e*'s krefs */
+	if (be->be_f_offset != offset) {
+		_prep_new_extent(e1, be, be->be_f_offset,
+				 offset - be->be_f_offset,
+				 PNFS_BLOCK_INVALID_DATA);
+		children[i++] = e1;
+		print_bl_extent(e1);
+	} else
+		merge1 = e1;
+	_prep_new_extent(e2, be, offset,
+			 min(length, be->be_f_offset + be->be_length - offset),
+			 PNFS_BLOCK_READWRITE_DATA);
+	children[i++] = e2;
+	print_bl_extent(e2);
+	if (offset + length < be->be_f_offset + be->be_length) {
+		_prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
+				 be->be_f_offset + be->be_length -
+				 offset - length,
+				 PNFS_BLOCK_INVALID_DATA);
+		children[i++] = e3;
+		print_bl_extent(e3);
+	} else
+		merge2 = e3;
+
+	/* Remove be from list, and insert the e* */
+	/* We don't get refs on e*, since this list is the base reference
+	 * set when init'ed.
+	 */
+	if (i < 3)
+		children[i] = NULL;
+	new = children[0];
+	list_replace(&be->be_node, &new->be_node);
+	bl_put_extent(be);
+	new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
+	for (j = 1; j < i; j++) {
+		old = new;
+		new = children[j];
+		list_add(&new->be_node, &old->be_node);
+	}
+	if (merge2) {
+		/* This is a HACK, should just create a _back_merge function */
+		new = list_entry(new->be_node.next,
+				 struct pnfs_block_extent, be_node);
+		new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
+	}
+	spin_unlock(&bl->bl_ext_lock);
+
+	/* Since we removed the base reference above, be is now scheduled for
+	 * destruction.
+	 */
+	bl_put_extent(be);
+	dprintk("%s returns %llu after split\n", __func__, rv);
+	return rv;
+
+ out_nosplit:
+	kfree(e1);
+	kfree(e2);
+	kfree(e3);
+	dprintk("%s returns %llu without splitting\n", __func__, rv);
+	return rv;
+}
+
+/* Finish a LAYOUTCOMMIT for every short extent on bl->bl_committing.
+ *
+ * When status is zero, each committed range is converted to read-write
+ * via set_to_rw() and its list entry is freed.  When status is non-zero,
+ * each entry is instead moved back with add_to_commitlist().
+ */
+void
+clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+			      const struct nfs4_layoutcommit_args *arg,
+			      int status)
+{
+	struct pnfs_block_short_extent *cur, *next;
+
+	dprintk("%s status %d\n", __func__, status);
+	list_for_each_entry_safe(cur, next, &bl->bl_committing, bse_node) {
+		u64 pos, stop;
+
+		if (unlikely(status)) {
+			/* Commit failed: requeue the range */
+			list_del(&cur->bse_node);
+			spin_lock(&bl->bl_ext_lock);
+			add_to_commitlist(bl, cur);
+			spin_unlock(&bl->bl_ext_lock);
+			continue;
+		}
+
+		pos = cur->bse_f_offset;
+		stop = pos + cur->bse_length;
+		/* set_to_rw() returns where to resume; repeat until the
+		 * whole range has been visited.
+		 */
+		do {
+			pos = set_to_rw(bl, pos, stop - pos);
+		} while (pos < stop);
+		list_del(&cur->bse_node);
+
+		kfree(cur);
+	}
+}
--
1.7.4.1
Hi, Trond and Benny,
On Tue, Jul 26, 2011 at 2:26 AM, Benny Halevy <[email protected]> wrote:
> On 2011-07-25 13:25, Myklebust, Trond wrote:
>>> -----Original Message-----
>>> From: Benny Halevy [mailto:[email protected]]
>>> Sent: Monday, July 25, 2011 10:50 AM
>>> To: Myklebust, Trond
>>> Cc: Jim Rees; [email protected]; peter honeyman
>>> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
>>> option, Makefile, and stubs
>>>
>>> On 2011-07-25 10:38, Myklebust, Trond wrote:
>>>>> -----Original Message-----
>>>>> From: Benny Halevy [mailto:[email protected]]
>>>>> Sent: Monday, July 25, 2011 10:31 AM
>>>>> To: Jim Rees
>>>>> Cc: Myklebust, Trond; [email protected]; peter honeyman
>>>>> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
>>>>> option, Makefile, and stubs
>>>>>
>>>>> On 2011-07-21 15:34, Jim Rees wrote:
>>>>>> From: Fred Isaman <[email protected]>
>>>>>>
>>>>>> Define a configuration variable to enable/disable compilation of
>>> the
>>>>>> block driver code.
>>>>>>
>>>>>> Add the minimal structure for a pnfs block layout driver, and
>> empty
>>>>>> list-heads that will hold the extent data
>>>>>>
>>>>>> [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
>>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> [pnfsblock: SQUASHME: port block layout code]
>>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>>> [pnfsblock: SQUASHME: adjust to API change]
>>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> [blocklayout: encode_layoutcommit implementation]
>>>>>> Signed-off-by: Boaz Harrosh <[email protected]>
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> [pnfsblock: layout alloc and free]
>>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>>> [pnfsblock: define module alias]
>>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>>> ---
>>>>>> fs/nfs/Kconfig | 8 ++-
>>>>>> fs/nfs/Makefile | 1 +
>>>>>> fs/nfs/blocklayout/Makefile | 5 +
>>>>>> fs/nfs/blocklayout/blocklayout.c | 175
>>>>> ++++++++++++++++++++++++++++++++++++++
>>>>>> fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
>>>>>> 5 files changed, 279 insertions(+), 1 deletions(-)
>>>>>> create mode 100644 fs/nfs/blocklayout/Makefile
>>>>>> create mode 100644 fs/nfs/blocklayout/blocklayout.c
>>>>>> create mode 100644 fs/nfs/blocklayout/blocklayout.h
>>>>>>
>>>>>> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
>>>>>> index 2cde5d9..be02077 100644
>>>>>> --- a/fs/nfs/Kconfig
>>>>>> +++ b/fs/nfs/Kconfig
>>>>>> @@ -79,15 +79,21 @@ config NFS_V4_1
>>>>>> depends on NFS_FS && NFS_V4 && EXPERIMENTAL
>>>>>> select SUNRPC_BACKCHANNEL
>>>>>> select PNFS_FILE_LAYOUT
>>>>>> + select PNFS_BLOCK
>>>>>> + select MD
>>>>>> + select BLK_DEV_DM
>>>>>
>>>>> Why is PNFS_BLOCK enabled automatically in all cases?
>>>>> That renders the use of modules for layout drivers totally useless.
>>>>> I sort of understand that for PNFS_FILE_LAYOUT (when my
>>>>> arm is twisted really hard behind my back :) since it
>>>>> is an integral part of RFC5661 but what's the justification
>>>>> for PNFS_BLOCK? and why blocks and not objects?
>>>>
>>>> The question is rather why did objects add a selectable compile
>>> option?
>>>
>>> Just good citizenship :)
>>>
>>>> What is the point of not compiling a given layout driver if all the
>>>> dependencies are met?
>>>
>>> Reducing build times...
>>> Building a smaller kernel when modules are disabled...
>>
>>
>> You can add a line with
>>     depends on m
>>
>> to ensure that it is always compiled as a module. I think that might be
>> a good thing until we have nailed down all the issues with pNFS.
>>
>
> I'd rather leave it as is so it's easier to test without CONFIG_MODULES.
>
>>> We're fine in terms of memory consumption when CONFIG_MODULES=y since
>>> the
>>> layout driver is loaded on demand but shouldn't be worried about
>>> the other case?
>>>
>>>>
>>>> IOW: The only thing I'd change above is the select MD and select
>>>> BLK_DEV_DM: I'd prefer something like
>>>>
>>>> config PNFS_BLOCK
>>>>   depends on NFS_V4_1 && MD && BLK_DEV_DM
>>>>   default y
>>>
>>> This is closer to the original version.
>>> However, selecting MD and BLK_DEV_DM was proven useful to
>> automatically
>>> take
>>> care of the module dependencies without having to dive into details.
>>
>> Yes, but since the MD is a completely different layer that is not under
>> our control (well, OK, Neil is still an NFS maintainer and an MD
>> maintainer) then I'd prefer to leave it as a dependency.
>>
>> We can always add something like
>>
>> comment
>>     depends on NFS_V4_1 && !BLK_DEV_DM
>>     Please enable BLK_DEV_MD if you wish to enable the pNFS block
>> driver.
>
> I never knew you can enable comments conditionally this way...
> It looks ok to me, I'll try it out and see how it shows in make *config
I tried the above and see some issue with this approach.
Because BLK_DEV_DM is not bool, the test !BLK_DEV_DM will not work as we wanted.
With the two lines:
comment "Please enable BLK_DEV_MD if you wish to enable the pNFS block"
depends on NFS_V4_1 && !BLK_DEV_DM
The comment will always show up there...
>
> Benny
>
>>
>>
>> Cheers
>>   Trond
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Thanks,
-Bergwolf
> -----Original Message-----
> From: Benny Halevy [mailto:[email protected]]
> Sent: Monday, July 25, 2011 10:31 AM
> To: Jim Rees
> Cc: Myklebust, Trond; [email protected]; peter honeyman
> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
> option, Makefile, and stubs
>
> On 2011-07-21 15:34, Jim Rees wrote:
> > From: Fred Isaman <[email protected]>
> >
> > Define a configuration variable to enable/disable compilation of the
> > block driver code.
> >
> > Add the minimal structure for a pnfs block layout driver, and empty
> > list-heads that will hold the extent data
> >
> > [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
> > Signed-off-by: Peng Tao <[email protected]>
> > Signed-off-by: Fred Isaman <[email protected]>
> > Signed-off-by: Benny Halevy <[email protected]>
> > [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
> > Signed-off-by: Benny Halevy <[email protected]>
> > Signed-off-by: Benny Halevy <[email protected]>
> > [pnfsblock: SQUASHME: port block layout code]
> > Signed-off-by: Peng Tao <[email protected]>
> > [pnfsblock: SQUASHME: adjust to API change]
> > Signed-off-by: Fred Isaman <[email protected]>
> > [pnfs: move pnfs_layout_type inline in nfs_inode]
> > Signed-off-by: Benny Halevy <[email protected]>
> > [blocklayout: encode_layoutcommit implementation]
> > Signed-off-by: Boaz Harrosh <[email protected]>
> > Signed-off-by: Benny Halevy <[email protected]>
> > Signed-off-by: Benny Halevy <[email protected]>
> > [pnfsblock: layout alloc and free]
> > Signed-off-by: Fred Isaman <[email protected]>
> > [pnfs: move pnfs_layout_type inline in nfs_inode]
> > Signed-off-by: Benny Halevy <[email protected]>
> > Signed-off-by: Benny Halevy <[email protected]>
> > [pnfsblock: define module alias]
> > Signed-off-by: Peng Tao <[email protected]>
> > ---
> > fs/nfs/Kconfig | 8 ++-
> > fs/nfs/Makefile | 1 +
> > fs/nfs/blocklayout/Makefile | 5 +
> > fs/nfs/blocklayout/blocklayout.c | 175
> ++++++++++++++++++++++++++++++++++++++
> > fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
> > 5 files changed, 279 insertions(+), 1 deletions(-)
> > create mode 100644 fs/nfs/blocklayout/Makefile
> > create mode 100644 fs/nfs/blocklayout/blocklayout.c
> > create mode 100644 fs/nfs/blocklayout/blocklayout.h
> >
> > diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
> > index 2cde5d9..be02077 100644
> > --- a/fs/nfs/Kconfig
> > +++ b/fs/nfs/Kconfig
> > @@ -79,15 +79,21 @@ config NFS_V4_1
> > depends on NFS_FS && NFS_V4 && EXPERIMENTAL
> > select SUNRPC_BACKCHANNEL
> > select PNFS_FILE_LAYOUT
> > + select PNFS_BLOCK
> > + select MD
> > + select BLK_DEV_DM
>
> Why is PNFS_BLOCK enabled automatically in all cases?
> That renders the use of modules for layout drivers totally useless.
> I sort of understand that for PNFS_FILE_LAYOUT (when my
> arm is twisted really hard behind my back :) since it
> is an integral part of RFC5661 but what's the justification
> for PNFS_BLOCK? and why blocks and not objects?
The question is rather why did objects add a selectable compile option?
What is the point of not compiling a given layout driver if all the
dependencies are met?
IOW: The only thing I'd change above is the select MD and select
BLK_DEV_DM: I'd prefer something like
config PNFS_BLOCK
depends on NFS_V4_1 && MD && BLK_DEV_DM
default y
Trond
From: Fred Isaman <[email protected]>
Adds working implementations of various support functions
to handle INVAL extents, needed by writes, such as
bl_mark_sectors_init and is_sector_initialized.
[pnfsblock: fix 64-bit compiler warnings for extent manipulation]
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[Implement release_inval_marks]
Signed-off-by: Zhang Jingwang <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 7 +-
fs/nfs/blocklayout/blocklayout.h | 31 +++++-
fs/nfs/blocklayout/extents.c | 253 ++++++++++++++++++++++++++++++++++++++
3 files changed, 288 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3bf60e3..c4b584b 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -78,10 +78,15 @@ release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
spin_unlock(&bl->bl_ext_lock);
}
-/* STUB */
static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
+ struct pnfs_inval_tracking *pos, *temp;
+
+ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
+ list_del(&pos->it_link);
+ kfree(pos);
+ }
return;
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 05f2e54..37e5989 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -38,6 +38,9 @@
#include "../pnfs.h"
+#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+
struct block_mount_id {
spinlock_t bm_lock; /* protects list */
struct list_head bm_devlist; /* holds pnfs_block_dev */
@@ -56,8 +59,23 @@ enum exstate4 {
PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
+#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
+
+struct my_tree { /* Stand-in container: entries are kept on a plain list */
+ sector_t mtt_step_size; /* Internal sector alignment; INIT_INVAL_MARKS sets it to min(PAGE_CACHE_SECTORS, blocksize) */
+ struct list_head mtt_stub; /* Should be a radix tree; holds struct pnfs_inval_tracking entries linked by it_link */
+};
+
struct pnfs_inval_markings {
- /* STUB */
+ spinlock_t im_lock;
+ struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
+ sector_t im_block_size; /* Server blocksize in sectors */
+};
+
+struct pnfs_inval_tracking { /* One tracked sector on my_tree.mtt_stub */
+ struct list_head it_link; /* Linkage on my_tree.mtt_stub */
+ int it_sector; /* Sector this entry describes. NOTE(review): lookups compare it against u64 sectors, so an int may truncate large values - confirm */
+ int it_tags; /* Bitmask of tag bits; lookups mask it with INTERNAL_MASK */
};
/* sector_t fields are all in 512-byte sectors */
@@ -76,7 +94,11 @@ struct pnfs_block_extent {
static inline void
INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
{
- /* STUB */
+ spin_lock_init(&marks->im_lock);
+ INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+ marks->im_block_size = blocksize;
+ marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
+ blocksize);
}
enum extentclass4 {
@@ -156,8 +178,13 @@ void free_block_dev(struct pnfs_block_dev *bdev);
struct pnfs_block_extent *
bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent **cow_read);
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+ sector_t offset, sector_t length,
+ sector_t **pages);
void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *alloc_extent(void);
+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 086ce36..4b58412 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -33,6 +33,259 @@
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+/* Bit numbers */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN 1
+#define EXTENT_IN_COMMIT 2
+#define INTERNAL_EXISTS MY_MAX_TAGS
+#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
+
+/* Returns largest t<=s s.t. t%base==0 */
+static inline sector_t normalize(sector_t s, int base)
+{
+ sector_t tmp = s; /* Since sector_div modifies its argument */
+ return s - sector_div(tmp, base); /* do_div needs u64; sector_t may be 32-bit */
+}
+
+static inline sector_t normalize_up(sector_t s, int base)
+{
+ return normalize(s + base - 1, base);
+}
+
+/* Complete stub using a list while we determine the API wanted */
+
+/* Returns tags, or negative */
+static int32_t _find_entry(struct my_tree *tree, u64 s)
+{
+ struct pnfs_inval_tracking *pos;
+
+ dprintk("%s(%llu) enter\n", __func__, s);
+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+ if (pos->it_sector > s)
+ continue;
+ else if (pos->it_sector == s)
+ return pos->it_tags & INTERNAL_MASK;
+ else
+ break;
+ }
+ return -ENOENT;
+}
+
+static inline
+int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
+{
+ int32_t tags;
+
+ dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+ s = normalize(s, tree->mtt_step_size);
+ tags = _find_entry(tree, s);
+ if ((tags < 0) || !(tags & (1 << tag)))
+ return 0;
+ else
+ return 1;
+}
+
+/* Creates entry with tag, or if entry already exists, unions tag to it.
+ * If storage is not NULL, newly created entry will use it.
+ * Returns number of entries added, or negative on error.
+ */
+static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
+ struct pnfs_inval_tracking *storage)
+{
+ int found = 0;
+ struct pnfs_inval_tracking *pos;
+
+ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
+ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+ if (pos->it_sector > s)
+ continue;
+ else if (pos->it_sector == s) {
+ found = 1;
+ break;
+ } else
+ break;
+ }
+ if (found) {
+ pos->it_tags |= (1 << tag);
+ return 0;
+ } else {
+ struct pnfs_inval_tracking *new;
+ if (storage)
+ new = storage;
+ else {
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+ }
+ new->it_sector = s;
+ new->it_tags = (1 << tag);
+ list_add(&new->it_link, &pos->it_link);
+ return 1;
+ }
+}
+
+/* XXXX Really want option to not create */
+/* Over range, unions tag with existing entries, else creates entry with tag */
+static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
+{
+ u64 i;
+
+ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
+ for (i = normalize(s, tree->mtt_step_size); i < s + length;
+ i += tree->mtt_step_size)
+ if (_add_entry(tree, i, tag, NULL) < 0) /* 1 means "added" */
+ return -ENOMEM;
+ return 0;
+}
+
+/* Ensure that future operations on given range of tree will not malloc */
+static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
+{
+ u64 start, end, s;
+ int count, i, used = 0, status = -ENOMEM;
+ struct pnfs_inval_tracking **storage;
+ struct pnfs_inval_markings *marks;
+
+ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
+ marks = container_of(tree, struct pnfs_inval_markings, im_tree);
+ start = normalize(offset, tree->mtt_step_size);
+ end = normalize_up(offset + length, tree->mtt_step_size);
+ count = (int)(end - start) / (int)tree->mtt_step_size;
+
+ /* Pre-malloc what memory we might need; zeroed so cleanup stops early */
+ storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
+ if (!storage)
+ return -ENOMEM;
+ for (i = 0; i < count; i++) {
+ storage[i] = kmalloc(sizeof(*storage[i]), GFP_NOFS);
+ if (!storage[i])
+ goto out_cleanup;
+ }
+
+ /* Link under im_lock; assumes tree is embedded as marks->im_tree (confirm) */
+ spin_lock(&marks->im_lock);
+ for (s = start; s < end; s += tree->mtt_step_size)
+ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
+ spin_unlock(&marks->im_lock);
+ status = 0;
+
+ out_cleanup:
+ for (i = used; i < count; i++) {
+ if (!storage[i])
+ break;
+ kfree(storage[i]);
+ }
+ kfree(storage);
+ return status;
+}
+
+static void set_needs_init(sector_t *array, sector_t offset)
+{
+ sector_t *p = array;
+
+ dprintk("%s enter\n", __func__);
+ if (!p)
+ return;
+ while (*p < offset)
+ p++;
+ if (*p == offset)
+ return;
+ else if (*p == ~0) {
+ *p++ = offset;
+ *p = ~0;
+ return;
+ } else {
+ sector_t *save = p;
+ dprintk("%s Adding %llu\n", __func__, (u64)offset);
+ while (*p != ~0)
+ p++;
+ p++;
+ memmove(save + 1, save, (char *)p - (char *)save);
+ *save = offset;
+ return;
+ }
+}
+
+/* We are relying on page lock to serialize this */
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
+{
+ int rv;
+
+ spin_lock(&marks->im_lock);
+ rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+ spin_unlock(&marks->im_lock);
+ return rv;
+}
+
+/* Marks sectors in [offset, offset + length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Notes where partial block is initialized, and helps prepare it for
+ * complete initialization later.
+ */
+/* Currently assumes offset is page-aligned */
+int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
+ sector_t offset, sector_t length,
+ sector_t **pages)
+{
+ sector_t s, start, end;
+ sector_t *array = NULL; /* Pages to mark */
+
+ dprintk("%s(offset=%llu,len=%llu) enter\n",
+ __func__, (u64)offset, (u64)length);
+ s = max((sector_t) 3,
+ 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
+ dprintk("%s set max=%llu\n", __func__, (u64)s);
+ if (pages) {
+ array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
+ if (!array)
+ goto outerr;
+ array[0] = ~0;
+ }
+
+ start = normalize(offset, marks->im_block_size);
+ end = normalize_up(offset + length, marks->im_block_size);
+ if (_preload_range(&marks->im_tree, start, end - start))
+ goto outerr;
+
+ spin_lock(&marks->im_lock);
+
+ for (s = normalize_up(start, PAGE_CACHE_SECTORS);
+ s < offset; s += PAGE_CACHE_SECTORS) {
+ dprintk("%s pre-area pages\n", __func__);
+ /* Portion of used block is not initialized */
+ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+ set_needs_init(array, s);
+ }
+ if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+ goto out_unlock;
+ for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
+ s < end; s += PAGE_CACHE_SECTORS) {
+ dprintk("%s post-area pages\n", __func__);
+ if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+ set_needs_init(array, s);
+ }
+
+ spin_unlock(&marks->im_lock);
+
+ if (pages) {
+ if (array[0] == ~0) {
+ kfree(array);
+ *pages = NULL;
+ } else
+ *pages = array;
+ }
+ return 0;
+
+ out_unlock:
+ spin_unlock(&marks->im_lock);
+ outerr:
+ if (pages) {
+ kfree(array);
+ *pages = NULL;
+ }
+ return -ENOMEM;
+}
+
static void print_bl_extent(struct pnfs_block_extent *be)
{
dprintk("PRINT EXTENT extent %p\n", be);
--
1.7.4.1
From: Andy Adamson <[email protected]>
The block driver uses GETDEVICELIST
Signed-off-by: Andy Adamson <[email protected]>
[pass struct nfs_server * to getdevicelist]
[get machine creds for getdevicelist]
[fix getdevicelist decode sizing]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 48 ++++++++++++++++++
fs/nfs/nfs4xdr.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/pnfs.h | 12 ++++
include/linux/nfs4.h | 1 +
include/linux/nfs_xdr.h | 11 ++++
5 files changed, 200 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 93ef776..8ceda46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5836,6 +5836,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
return status;
}
+/*
+ * Retrieve the list of Data Server devices from the MDS.
+ */
+static int _nfs4_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist)
+{
+ struct nfs4_getdevicelist_args args = {
+ .fh = fh,
+ .layoutclass = server->pnfs_curr_ld->id,
+ };
+ struct nfs4_getdevicelist_res res = {
+ .devlist = devlist,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ int status;
+
+ dprintk("--> %s\n", __func__);
+ status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+ &res.seq_res, 0);
+ dprintk("<-- %s status=%d\n", __func__, status);
+ return status;
+}
+
+int nfs4_proc_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist)
+{
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = nfs4_handle_exception(server,
+ _nfs4_getdevicelist(server, fh, devlist),
+ &exception);
+ } while (exception.retry);
+
+ dprintk("%s: err=%d, num_devs=%u\n", __func__,
+ err, devlist->num_devs);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
+
static int
_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
{
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c191a9b..a82dd40 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -314,6 +314,17 @@ static int nfs4_stat_to_errno(int);
XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
+#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
+ encode_verifier_maxsz)
+#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
+ 2 /* nfs_cookie4 gdlr_cookie */ + \
+ decode_verifier_maxsz \
+ /* verifier4 gdlr_verifier */ + \
+ 1 /* gdlr_deviceid_list count */ + \
+ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
+ NFS4_DEVICEID4_SIZE) \
+ /* gdlr_deviceid_list */ + \
+ 1 /* bool gdlr_eof */)
#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
@@ -748,6 +759,14 @@ static int nfs4_stat_to_errno(int);
#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_reclaim_complete_maxsz)
+#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getdevicelist_maxsz)
+#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getdevicelist_maxsz)
#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz +\
encode_getdeviceinfo_maxsz)
@@ -1855,6 +1874,26 @@ static void encode_sequence(struct xdr_stream *xdr,
#ifdef CONFIG_NFS_V4_1
static void
+encode_getdevicelist(struct xdr_stream *xdr,
+ const struct nfs4_getdevicelist_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+ nfs4_verifier dummy = {
+ .data = "dummmmmy",
+ };
+
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(OP_GETDEVICELIST);
+ *p++ = cpu_to_be32(args->layoutclass);
+ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
+ xdr_encode_hyper(p, 0ULL); /* cookie */
+ encode_nfs4_verifier(xdr, &dummy);
+ hdr->nops++;
+ hdr->replen += decode_getdevicelist_maxsz;
+}
+
+static void
encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfs4_getdeviceinfo_args *args,
struct compound_hdr *hdr)
@@ -2775,6 +2814,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
}
/*
+ * Encode GETDEVICELIST request
+ */
+static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_getdevicelist_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getdevicelist(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode GETDEVICEINFO request
*/
static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5268,6 +5325,50 @@ out_overflow:
}
#if defined(CONFIG_NFS_V4_1)
+/*
+ * TODO: Need to handle case when EOF != true;
+ */
+static int decode_getdevicelist(struct xdr_stream *xdr,
+ struct pnfs_devicelist *res)
+{
+ __be32 *p;
+ int status, i;
+ struct nfs_writeverf verftemp;
+
+ status = decode_op_hdr(xdr, OP_GETDEVICELIST);
+ if (status)
+ return status;
+
+ p = xdr_inline_decode(xdr, 8 + 8 + 4); /* cookie, verifier, count */
+ if (unlikely(!p))
+ goto out_overflow;
+
+ /* TODO: Skip cookie for now */
+ p += 2;
+
+ /* Read verifier */
+ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8);
+
+ res->num_devs = be32_to_cpup(p);
+
+ dprintk("%s: num_dev %u\n", __func__, res->num_devs);
+
+ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM)
+ return -EIO; /* malformed reply: local errno, as in out_overflow */
+
+ p = xdr_inline_decode(xdr,
+ res->num_devs * NFS4_DEVICEID4_SIZE + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ for (i = 0; i < res->num_devs; i++)
+ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
+ NFS4_DEVICEID4_SIZE);
+ res->eof = be32_to_cpup(p);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
static int decode_getdeviceinfo(struct xdr_stream *xdr,
struct pnfs_device *pdev)
@@ -6542,6 +6643,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
}
/*
+ * Decode GETDEVICELIST response
+ */
+static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_getdevicelist_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ dprintk("decoding getdevicelist!\n");
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status != 0)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status != 0)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status != 0)
+ goto out;
+ status = decode_getdevicelist(xdr, res->devlist); /* fills res->devlist */
+out:
+ return status;
+}
+
+/*
* Decode GETDEVINFO response
*/
static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -6902,6 +7029,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time),
PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
+ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 078670d..ffea314 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -133,14 +133,26 @@ struct pnfs_device {
unsigned int layout_type;
unsigned int mincount;
struct page **pages;
+ void *area;
unsigned int pgbase;
unsigned int pglen;
};
+#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
+
+struct pnfs_devicelist {
+ unsigned int eof;
+ unsigned int num_devs;
+ struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
+};
+
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
/* nfs4proc.c */
+extern int nfs4_proc_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh,
+ struct pnfs_devicelist *devlist);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev);
extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index a3c4bc8..76f99e8 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -566,6 +566,7 @@ enum {
NFSPROC4_CLNT_SECINFO_NO_NAME,
NFSPROC4_CLNT_TEST_STATEID,
NFSPROC4_CLNT_FREE_STATEID,
+ NFSPROC4_CLNT_GETDEVICELIST,
};
/* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5b11595..a07b682 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -235,6 +235,17 @@ struct nfs4_layoutget {
gfp_t gfp_flags;
};
+struct nfs4_getdevicelist_args {
+ const struct nfs_fh *fh;
+ u32 layoutclass;
+ struct nfs4_sequence_args seq_args;
+};
+
+struct nfs4_getdevicelist_res {
+ struct pnfs_devicelist *devlist;
+ struct nfs4_sequence_res seq_res;
+};
+
struct nfs4_getdeviceinfo_args {
struct pnfs_device *pdev;
struct nfs4_sequence_args seq_args;
--
1.7.4.1
From: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/extents.c | 47 ++++++++++++++++++++++++++++++++++++++++++
1 files changed, 47 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 4b58412..ed62cd8 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -501,3 +501,50 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
print_bl_extent(ret);
return ret;
}
+
+/* Helper function to set_to_rw that initialize a new extent */
+static void
+_prep_new_extent(struct pnfs_block_extent *new,
+ struct pnfs_block_extent *orig,
+ sector_t offset, sector_t length, int state)
+{
+ kref_init(&new->be_refcnt);
+ /* don't need to INIT_LIST_HEAD(&new->be_node) */
+ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
+ new->be_mdev = orig->be_mdev;
+ new->be_f_offset = offset;
+ new->be_length = length;
+ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
+ new->be_state = state;
+ new->be_inval = orig->be_inval;
+}
+
+/* Tries to merge be with extent in front of it in list.
+ * Frees storage if not used.
+ */
+static struct pnfs_block_extent *
+_front_merge(struct pnfs_block_extent *be, struct list_head *head,
+ struct pnfs_block_extent *storage)
+{
+ struct pnfs_block_extent *prev;
+
+ if (!storage)
+ goto no_merge;
+ if (&be->be_node == head || be->be_node.prev == head)
+ goto no_merge;
+ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
+ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
+ !extents_consistent(prev, be))
+ goto no_merge;
+ _prep_new_extent(storage, prev, prev->be_f_offset,
+ prev->be_length + be->be_length, prev->be_state);
+ list_replace(&prev->be_node, &storage->be_node);
+ bl_put_extent(prev);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ return storage;
+
+ no_merge:
+ kfree(storage);
+ return be;
+}
--
1.7.4.1
From: Fred Isaman <[email protected]>
Define a configuration variable to enable/disable compilation of the
block driver code.
Add the minimal structure for a pnfs block layout driver, and empty
list-heads that will hold the extent data
[pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: SQUASHME: port block layout code]
Signed-off-by: Peng Tao <[email protected]>
[pnfsblock: SQUASHME: adjust to API change]
Signed-off-by: Fred Isaman <[email protected]>
[pnfs: move pnfs_layout_type inline in nfs_inode]
Signed-off-by: Benny Halevy <[email protected]>
[blocklayout: encode_layoutcommit implementation]
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: layout alloc and free]
Signed-off-by: Fred Isaman <[email protected]>
[pnfs: move pnfs_layout_type inline in nfs_inode]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: define module alias]
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/Kconfig | 8 ++-
fs/nfs/Makefile | 1 +
fs/nfs/blocklayout/Makefile | 5 +
fs/nfs/blocklayout/blocklayout.c | 175 ++++++++++++++++++++++++++++++++++++++
fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
5 files changed, 279 insertions(+), 1 deletions(-)
create mode 100644 fs/nfs/blocklayout/Makefile
create mode 100644 fs/nfs/blocklayout/blocklayout.c
create mode 100644 fs/nfs/blocklayout/blocklayout.h
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2cde5d9..be02077 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -79,15 +79,21 @@ config NFS_V4_1
depends on NFS_FS && NFS_V4 && EXPERIMENTAL
select SUNRPC_BACKCHANNEL
select PNFS_FILE_LAYOUT
+ select PNFS_BLOCK
+ select MD
+ select BLK_DEV_DM
help
This option enables support for minor version 1 of the NFSv4 protocol
- (RFC 5661) in the kernel's NFS client.
+ (RFC 5661 and RFC 5663) in the kernel's NFS client.
If unsure, say N.
config PNFS_FILE_LAYOUT
tristate
+config PNFS_BLOCK
+ tristate
+
config PNFS_OBJLAYOUT
tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 6a34f7d..b58613d 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
+obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
new file mode 100644
index 0000000..6bf49cd
--- /dev/null
+++ b/fs/nfs/blocklayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS block layout driver kernel module
+#
+obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
+blocklayoutdriver-objs := blocklayout.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
new file mode 100644
index 0000000..55a2a95
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -0,0 +1,175 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <[email protected]>
+ * Fred Isaman <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson <[email protected]>");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+static enum pnfs_try_status
+bl_write_pagelist(struct nfs_write_data *wdata,
+ int sync)
+{
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* STUB */
+static void
+release_extents(struct pnfs_block_layout *bl,
+ struct pnfs_layout_range *range)
+{
+ return;
+}
+
+/* STUB */
+static void
+release_inval_marks(struct pnfs_inval_markings *marks)
+{
+ return;
+}
+
+static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+
+ dprintk("%s enter\n", __func__);
+ release_extents(bl, NULL);
+ release_inval_marks(&bl->bl_inval);
+ kfree(bl);
+}
+
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ struct pnfs_block_layout *bl;
+
+ dprintk("%s enter\n", __func__);
+ bl = kzalloc(sizeof(*bl), gfp_flags);
+ if (!bl)
+ return NULL;
+ spin_lock_init(&bl->bl_ext_lock);
+ INIT_LIST_HEAD(&bl->bl_extents[0]);
+ INIT_LIST_HEAD(&bl->bl_extents[1]);
+ INIT_LIST_HEAD(&bl->bl_commit);
+ INIT_LIST_HEAD(&bl->bl_committing);
+ bl->bl_count = 0;
+ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
+ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
+ return &bl->bl_layout;
+}
+
+static void
+bl_free_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+ return NULL;
+}
+
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+ const struct nfs4_layoutcommit_args *arg)
+{
+}
+
+static void
+bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutcommit_data *lcdata)
+{
+}
+
+static int
+bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
+{
+ dprintk("%s enter\n", __func__);
+ return 0;
+}
+
+static int
+bl_clear_layoutdriver(struct nfs_server *server)
+{
+ dprintk("%s enter\n", __func__);
+ return 0;
+}
+
+static struct pnfs_layoutdriver_type blocklayout_type = {
+ .id = LAYOUT_BLOCK_VOLUME,
+ .name = "LAYOUT_BLOCK_VOLUME",
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = bl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .encode_layoutcommit = bl_encode_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .clear_layoutdriver = bl_clear_layoutdriver,
+};
+
+static int __init nfs4blocklayout_init(void)
+{
+ int ret;
+
+ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+ ret = pnfs_register_layoutdriver(&blocklayout_type);
+ return ret;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+ __func__);
+
+ pnfs_unregister_layoutdriver(&blocklayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-3");
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
new file mode 100644
index 0000000..bda87e0
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -0,0 +1,91 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <[email protected]>
+ * Fred Isaman <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
+#define FS_NFS_NFS4BLOCKLAYOUT_H
+
+#include <linux/device-mapper.h>
+#include <linux/nfs_fs.h>
+#include "../pnfs.h"
+
+enum exstate4 {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
+ PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
+};
+
+struct pnfs_inval_markings {
+ /* STUB */
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+ struct kref be_refcnt;
+ struct list_head be_node; /* link into lseg list */
+ struct nfs4_deviceid be_devid; /* STUB - removable??? */
+ struct block_device *be_mdev;
+ sector_t be_f_offset; /* the starting offset in the file */
+ sector_t be_length; /* the size of the extent */
+ sector_t be_v_offset; /* the starting offset in the volume */
+ enum exstate4 be_state; /* the state of this extent */
+ struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
+static inline void
+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
+ /* STUB */
+}
+
+enum extentclass4 {
+ RW_EXTENT = 0, /* READWRITE and INVAL */
+ RO_EXTENT = 1, /* READ and NONE */
+ EXTENT_LISTS = 2,
+};
+
+struct pnfs_block_layout {
+ struct pnfs_layout_hdr bl_layout;
+ struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+ spinlock_t bl_ext_lock; /* Protects list manipulation */
+ struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
+ struct list_head bl_commit; /* Needs layout commit */
+ struct list_head bl_committing; /* Layout committing */
+ unsigned int bl_count; /* entries in bl_commit */
+ sector_t bl_blocksize; /* Server blocksize in sectors */
+};
+
+static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
--
1.7.4.1
From: Peng Tao <[email protected]>
Save the layoutcommit credential in the layout header instead of in every segment.
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/pnfs.c | 12 ++++++------
fs/nfs/pnfs.h | 1 -
2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index dae19dd..fbebd2a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -211,6 +211,7 @@ destroy_layout_hdr(struct pnfs_layout_hdr *lo)
dprintk("%s: freeing layout cache %p\n", __func__, lo);
BUG_ON(!list_empty(&lo->plh_layouts));
NFS_I(lo->plh_inode)->layout = NULL;
+ put_rpccred(lo->plh_lc_cred);
pnfs_free_layout_hdr(lo);
}
@@ -1007,6 +1008,10 @@ pnfs_update_layout(struct inode *ino,
list_del_init(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
}
+ if (first) {
+ lo->plh_lc_cred =
+ get_rpccred(ctx->state->owner->so_cred);
+ }
atomic_dec(&lo->plh_outstanding);
put_layout_hdr(lo);
out:
@@ -1386,8 +1391,6 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
spin_lock(&nfsi->vfs_inode.i_lock);
if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
mark_as_dirty = true;
- nfsi->layout->plh_lc_cred =
- get_rpccred(wdata->args.context->state->owner->so_cred);
dprintk("%s: Set layoutcommit for inode %lu ",
__func__, wdata->inode->i_ino);
}
@@ -1421,7 +1424,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
- struct rpc_cred *cred;
loff_t end_pos;
int status = 0;
@@ -1449,16 +1451,14 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
pnfs_list_write_lseg(inode, &data->lseg_list);
end_pos = nfsi->layout->plh_lwb;
- cred = nfsi->layout->plh_lc_cred;
nfsi->layout->plh_lwb = 0;
- nfsi->layout->plh_lc_cred = NULL;
memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
sizeof(nfsi->layout->plh_stateid.data));
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->cred = cred;
+ data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5f1b532..bddd8b9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -46,7 +46,6 @@ struct pnfs_layout_segment {
atomic_t pls_refcount;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
- struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
};
enum pnfs_try_status {
--
1.7.4.1
> -----Original Message-----
> From: Benny Halevy [mailto:[email protected]]
> Sent: Monday, July 25, 2011 10:50 AM
> To: Myklebust, Trond
> Cc: Jim Rees; [email protected]; peter honeyman
> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
> option, Makefile, and stubs
>
> On 2011-07-25 10:38, Myklebust, Trond wrote:
> >> -----Original Message-----
> >> From: Benny Halevy [mailto:[email protected]]
> >> Sent: Monday, July 25, 2011 10:31 AM
> >> To: Jim Rees
> >> Cc: Myklebust, Trond; [email protected]; peter honeyman
> >> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
> >> option, Makefile, and stubs
> >>
> >> On 2011-07-21 15:34, Jim Rees wrote:
> >>> From: Fred Isaman <[email protected]>
> >>>
> >>> Define a configuration variable to enable/disable compilation of
> the
> >>> block driver code.
> >>>
> >>> Add the minimal structure for a pnfs block layout driver, and
empty
> >>> list-heads that will hold the extent data
> >>>
> >>> [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
> >>> Signed-off-by: Peng Tao <[email protected]>
> >>> Signed-off-by: Fred Isaman <[email protected]>
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> [pnfsblock: SQUASHME: port block layout code]
> >>> Signed-off-by: Peng Tao <[email protected]>
> >>> [pnfsblock: SQUASHME: adjust to API change]
> >>> Signed-off-by: Fred Isaman <[email protected]>
> >>> [pnfs: move pnfs_layout_type inline in nfs_inode]
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> [blocklayout: encode_layoutcommit implementation]
> >>> Signed-off-by: Boaz Harrosh <[email protected]>
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> [pnfsblock: layout alloc and free]
> >>> Signed-off-by: Fred Isaman <[email protected]>
> >>> [pnfs: move pnfs_layout_type inline in nfs_inode]
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> Signed-off-by: Benny Halevy <[email protected]>
> >>> [pnfsblock: define module alias]
> >>> Signed-off-by: Peng Tao <[email protected]>
> >>> ---
> >>> fs/nfs/Kconfig | 8 ++-
> >>> fs/nfs/Makefile | 1 +
> >>> fs/nfs/blocklayout/Makefile | 5 +
> >>> fs/nfs/blocklayout/blocklayout.c | 175
> >> ++++++++++++++++++++++++++++++++++++++
> >>> fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
> >>> 5 files changed, 279 insertions(+), 1 deletions(-)
> >>> create mode 100644 fs/nfs/blocklayout/Makefile
> >>> create mode 100644 fs/nfs/blocklayout/blocklayout.c
> >>> create mode 100644 fs/nfs/blocklayout/blocklayout.h
> >>>
> >>> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
> >>> index 2cde5d9..be02077 100644
> >>> --- a/fs/nfs/Kconfig
> >>> +++ b/fs/nfs/Kconfig
> >>> @@ -79,15 +79,21 @@ config NFS_V4_1
> >>> depends on NFS_FS && NFS_V4 && EXPERIMENTAL
> >>> select SUNRPC_BACKCHANNEL
> >>> select PNFS_FILE_LAYOUT
> >>> + select PNFS_BLOCK
> >>> + select MD
> >>> + select BLK_DEV_DM
> >>
> >> Why is PNFS_BLOCK enabled automatically in all cases?
> >> That renders the use of modules for layout drivers totally useless.
> >> I sort of understand that for PNFS_FILE_LAYOUT (when my
> >> arm is twisted really hard behind my back :) since it
> >> is an integral part of RFC5661 but what's the justification
> >> for PNFS_BLOCK? and why blocks and not objects?
> >
> > The question is rather why did objects add a selectable compile
> option?
>
> Just good citizenship :)
>
> > What is the point of not compiling a given layout driver if all the
> > dependencies are met?
>
> Reducing build times...
> Building a smaller kernel when modules are disabled...
You can add a line with
depends on m
to ensure that it is always compiled as a module. I think that might be
a good thing until we have nailed down all the issues with pNFS.
> We're fine in terms of memory consumption when CONFIG_MODULES=y since
> the
> layout driver is loaded on demand, but shouldn't we be worried about
> the other case?
>
> >
> > IOW: The only thing I'd change above is the select MD and select
> > BLK_DEV_DM: I'd prefer something like
> >
> > config PNFS_BLOCK
> > depends on NFS_V4_1 && MD && BLK_DEV_DM
> > default y
>
> This is closer to the original version.
> However, selecting MD and BLK_DEV_DM was proven useful to
automatically
> take
> care of the module dependencies without having to dive into details.
Yes, but since the MD is a completely different layer that is not under
our control (well, OK, Neil is still an NFS maintainer and an MD
maintainer) then I'd prefer to leave it as a dependency.
We can always add something like
comment
depends on NFS_V4_1 && !BLK_DEV_DM
Please enable BLK_DEV_DM if you wish to enable the pNFS block
driver.
Cheers
Trond
From: Andy Adamson <[email protected]>
This gives layout drivers a chance to clean up structures they put in place at
encode_layoutcommit.
Signed-off-by: Andy Adamson <[email protected]>
[fixup layout header pointer for layoutcommit]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/nfs4proc.c | 1 +
fs/nfs/nfs4xdr.c | 3 ++-
fs/nfs/pnfs.c | 10 ++++++++++
fs/nfs/pnfs.h | 5 +++++
include/linux/nfs_xdr.h | 1 +
5 files changed, 19 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e02f545..795033c5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5965,6 +5965,7 @@ static void nfs4_layoutcommit_release(void *calldata)
struct nfs4_layoutcommit_data *data = calldata;
struct pnfs_layout_segment *lseg, *tmp;
+ pnfs_cleanup_layoutcommit(data->args.inode, data);
/* Matched by references in pnfs_set_layoutcommit */
list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
list_del_init(&lseg->pls_lc_list);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5ce3c64..07c41b2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1991,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
*p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
/* Only whole file layouts */
p = xdr_encode_hyper(p, 0); /* offset */
- p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
+ p = xdr_encode_hyper(p, args->lastbytewritten+1); /* length */
*p++ = cpu_to_be32(0); /* reclaim */
p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
*p++ = cpu_to_be32(1); /* newoffset = TRUE */
@@ -5596,6 +5596,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
int status;
status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+ res->status = status;
if (status)
return status;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index fbebd2a..3b20753 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1411,6 +1411,16 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
+void pnfs_cleanup_layoutcommit(struct inode *inode,
+ struct nfs4_layoutcommit_data *data)
+{
+ struct nfs_server *nfss = NFS_SERVER(inode);
+
+ if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
+ nfss->pnfs_curr_ld->cleanup_layoutcommit(NFS_I(inode)->layout,
+ data);
+}
+
/*
* For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
* NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index bddd8b9..f271425 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -113,6 +113,9 @@ struct pnfs_layoutdriver_type {
struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
+ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+ struct nfs4_layoutcommit_data *data);
+
void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
@@ -196,6 +199,8 @@ void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+void pnfs_cleanup_layoutcommit(struct inode *inode,
+ struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_ld_write_done(struct nfs_write_data *);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 94f27e5..569ea5b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -269,6 +269,7 @@ struct nfs4_layoutcommit_res {
struct nfs_fattr *fattr;
const struct nfs_server *server;
struct nfs4_sequence_res seq_res;
+ int status;
};
struct nfs4_layoutcommit_data {
--
1.7.4.1
From: Peng Tao <[email protected]>
Some layout drivers, like block, will have multiple segments. Generic code
should be able to handle this. Layoutcommit takes a list of segments, and the
last write offset is saved at the inode level.
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4filelayout.c | 2 +-
fs/nfs/nfs4proc.c | 8 ++++++-
fs/nfs/pnfs.c | 50 ++++++++++++++++++++++++----------------------
fs/nfs/pnfs.h | 5 +++-
include/linux/nfs_xdr.h | 2 +-
5 files changed, 39 insertions(+), 28 deletions(-)
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be93a62..e8915d4 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
pnfs_set_layoutcommit(wdata);
dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
- (unsigned long) wdata->lseg->pls_end_pos);
+ (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb);
}
/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8ceda46..784c1a2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5962,9 +5962,15 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
static void nfs4_layoutcommit_release(void *calldata)
{
struct nfs4_layoutcommit_data *data = calldata;
+ struct pnfs_layout_segment *lseg, *tmp;
/* Matched by references in pnfs_set_layoutcommit */
- put_lseg(data->lseg);
+ list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
+ list_del_init(&lseg->pls_lc_list);
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
+ &lseg->pls_flags))
+ put_lseg(lseg);
+ }
put_rpccred(data->cred);
kfree(data);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e72724..dae19dd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -236,6 +236,7 @@ static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
INIT_LIST_HEAD(&lseg->pls_list);
+ INIT_LIST_HEAD(&lseg->pls_lc_list);
atomic_set(&lseg->pls_refcount, 1);
smp_mb();
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
@@ -1362,16 +1363,17 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
/*
- * Currently there is only one (whole file) write lseg.
+ * There can be multiple RW segments.
*/
-static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
+static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
{
- struct pnfs_layout_segment *lseg, *rv = NULL;
+ struct pnfs_layout_segment *lseg;
- list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
- if (lseg->pls_range.iomode == IOMODE_RW)
- rv = lseg;
- return rv;
+ list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
+ if (lseg->pls_range.iomode == IOMODE_RW &&
+ test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ list_add(&lseg->pls_lc_list, listp);
+ }
}
void
@@ -1383,17 +1385,21 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
spin_lock(&nfsi->vfs_inode.i_lock);
if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
- /* references matched in nfs4_layoutcommit_release */
- get_lseg(wdata->lseg);
- wdata->lseg->pls_lc_cred =
- get_rpccred(wdata->args.context->state->owner->so_cred);
mark_as_dirty = true;
+ nfsi->layout->plh_lc_cred =
+ get_rpccred(wdata->args.context->state->owner->so_cred);
dprintk("%s: Set layoutcommit for inode %lu ",
__func__, wdata->inode->i_ino);
}
- if (end_pos > wdata->lseg->pls_end_pos)
- wdata->lseg->pls_end_pos = end_pos;
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) {
+ /* references matched in nfs4_layoutcommit_release */
+ get_lseg(wdata->lseg);
+ }
+ if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
spin_unlock(&nfsi->vfs_inode.i_lock);
+ dprintk("%s: lseg %p end_pos %llu\n",
+ __func__, wdata->lseg, nfsi->layout->plh_lwb);
/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
* will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
@@ -1415,7 +1421,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
- struct pnfs_layout_segment *lseg;
struct rpc_cred *cred;
loff_t end_pos;
int status = 0;
@@ -1433,29 +1438,26 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
goto out;
}
+ INIT_LIST_HEAD(&data->lseg_list);
spin_lock(&inode->i_lock);
if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
spin_unlock(&inode->i_lock);
kfree(data);
goto out;
}
- /*
- * Currently only one (whole file) write lseg which is referenced
- * in pnfs_set_layoutcommit and will be found.
- */
- lseg = pnfs_list_write_lseg(inode);
- end_pos = lseg->pls_end_pos;
- cred = lseg->pls_lc_cred;
- lseg->pls_end_pos = 0;
- lseg->pls_lc_cred = NULL;
+ pnfs_list_write_lseg(inode, &data->lseg_list);
+
+ end_pos = nfsi->layout->plh_lwb;
+ cred = nfsi->layout->plh_lc_cred;
+ nfsi->layout->plh_lwb = 0;
+ nfsi->layout->plh_lc_cred = NULL;
memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
sizeof(nfsi->layout->plh_stateid.data));
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->lseg = lseg;
data->cred = cred;
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 23d8267..5f1b532 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -36,16 +36,17 @@
enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
NFS_LSEG_ROC, /* roc bit received from server */
+ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
};
struct pnfs_layout_segment {
struct list_head pls_list;
+ struct list_head pls_lc_list;
struct pnfs_layout_range pls_range;
atomic_t pls_refcount;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
- loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
};
enum pnfs_try_status {
@@ -128,6 +129,8 @@ struct pnfs_layout_hdr {
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_flags;
+ loff_t plh_lwb; /* last write byte for layoutcommit */
+ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index a07b682..21f333e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -273,7 +273,7 @@ struct nfs4_layoutcommit_res {
struct nfs4_layoutcommit_data {
struct rpc_task task;
struct nfs_fattr fattr;
- struct pnfs_layout_segment *lseg;
+ struct list_head lseg_list;
struct rpc_cred *cred;
struct nfs4_layoutcommit_args args;
struct nfs4_layoutcommit_res res;
--
1.7.4.1
From: Fred <[email protected]>
Implement bl_find_get_extent(), one of the core extent manipulation
routines.
[pnfsblock: Lookup list entry of layouts and tags in reverse order]
Signed-off-by: Zhang Jingwang <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
pnfsblock: fix print format warnings for sector_t and size_t
gcc spews warnings about these on x86_64, e.g.:
fs/nfs/blocklayout/blocklayout.c:74: warning: format ‘%Lu’ expects type ‘long long unsigned int’, but argument 2 has type ‘sector_t’
fs/nfs/blocklayout/blocklayout.c:388: warning: format ‘%d’ expects type ‘int’, but argument 5 has type ‘size_t’
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.h | 3 ++
fs/nfs/blocklayout/extents.c | 47 ++++++++++++++++++++++++++++++++++++++
2 files changed, 50 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index ed6253e..05f2e54 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -153,6 +153,9 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
void free_block_dev(struct pnfs_block_dev *bdev);
/* extents.c */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent **cow_read);
void bl_put_extent(struct pnfs_block_extent *be);
struct pnfs_block_extent *alloc_extent(void);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index d76bb43..086ce36 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -201,3 +201,50 @@ bl_add_merge_extent(struct pnfs_block_layout *bl,
bl_put_extent(new);
return -EIO;
}
+
+/* Returns extent, or NULL. If a second READ extent exists, it is returned
+ * in cow_read, if given.
+ *
+ * The extents are kept in two separate ordered lists, one for READ and NONE,
+ * one for READWRITE and INVALID. Within each list, we assume:
+ * 1. Extents are ordered by file offset.
+ * 2. For any given isect, there is at most one extent that matches.
+ */
+struct pnfs_block_extent *
+bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+ struct pnfs_block_extent **cow_read)
+{
+ struct pnfs_block_extent *be, *cow, *ret;
+ int i;
+
+ dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
+ cow = ret = NULL;
+ spin_lock(&bl->bl_ext_lock);
+ for (i = 0; i < EXTENT_LISTS; i++) {
+ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
+ if (isect >= be->be_f_offset + be->be_length)
+ break;
+ if (isect >= be->be_f_offset) {
+ /* We have found an extent */
+ dprintk("%s Get %p (%i)\n", __func__, be,
+ atomic_read(&be->be_refcnt.refcount));
+ kref_get(&be->be_refcnt);
+ if (!ret)
+ ret = be;
+ else if (be->be_state != PNFS_BLOCK_READ_DATA)
+ bl_put_extent(be);
+ else
+ cow = be;
+ break;
+ }
+ }
+ if (ret &&
+ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
+ break;
+ }
+ spin_unlock(&bl->bl_ext_lock);
+ if (cow_read)
+ *cow_read = cow;
+ print_bl_extent(ret);
+ return ret;
+}
--
1.7.4.1
From: Fred Isaman <[email protected]>
Replace a stub, so that extents underlying the layouts are properly
added, merged, or ignored as necessary.
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: delete the new node before put it]
Signed-off-by: Mingyang Guo <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.h | 13 +++++
fs/nfs/blocklayout/extents.c | 106 ++++++++++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 7dab978..ff140e7 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -80,6 +80,14 @@ enum extentclass4 {
EXTENT_LISTS = 2,
};
+static inline int choose_list(enum exstate4 state)
+{
+ if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
+ return RO_EXTENT;
+ else
+ return RW_EXTENT;
+}
+
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
@@ -137,5 +145,10 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
/* blocklayoutdm.c */
void free_block_dev(struct pnfs_block_dev *bdev);
+/* extents.c */
void bl_put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *alloc_extent(void);
+int bl_add_merge_extent(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new);
+
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 44c3364..d76bb43 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -95,3 +95,109 @@ void print_elist(struct list_head *list)
}
dprintk("****************\n");
}
+
+static inline int
+extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
+{
+ /* Note this assumes new->be_f_offset >= old->be_f_offset */
+ return (new->be_state == old->be_state) &&
+ ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
+ ((new->be_v_offset - old->be_v_offset ==
+ new->be_f_offset - old->be_f_offset) &&
+ new->be_mdev == old->be_mdev));
+}
+
+/* Adds new to appropriate list in bl, modifying new and removing existing
+ * extents as appropriate to deal with overlaps.
+ *
+ * See bl_find_get_extent for list constraints.
+ *
+ * Refcount on new is already set. If we end up not using it, or error out,
+ * we need to put the reference.
+ *
+ * bl->bl_ext_lock is held by caller.
+ */
+int
+bl_add_merge_extent(struct pnfs_block_layout *bl,
+ struct pnfs_block_extent *new)
+{
+ struct pnfs_block_extent *be, *tmp;
+ sector_t end = new->be_f_offset + new->be_length;
+ struct list_head *list;
+
+ dprintk("%s enter with be=%p\n", __func__, new);
+ print_bl_extent(new);
+ list = &bl->bl_extents[choose_list(new->be_state)];
+ print_elist(list);
+
+ /* Scan for proper place to insert, extending new to the left
+ * as much as possible.
+ */
+ list_for_each_entry_safe(be, tmp, list, be_node) {
+ if (new->be_f_offset < be->be_f_offset)
+ break;
+ if (end <= be->be_f_offset + be->be_length) {
+ /* new is a subset of existing be*/
+ if (extents_consistent(be, new)) {
+ dprintk("%s: new is subset, ignoring\n",
+ __func__);
+ bl_put_extent(new);
+ return 0;
+ } else
+ goto out_err;
+ } else if (new->be_f_offset <=
+ be->be_f_offset + be->be_length) {
+ /* new overlaps or abuts existing be */
+ if (extents_consistent(be, new)) {
+ /* extend new to fully replace be */
+ new->be_length += new->be_f_offset -
+ be->be_f_offset;
+ new->be_f_offset = be->be_f_offset;
+ new->be_v_offset = be->be_v_offset;
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else if (new->be_f_offset !=
+ be->be_f_offset + be->be_length)
+ goto out_err;
+ }
+ }
+ /* Note that if we never hit the above break, be will not point to a
+ * valid extent. However, in that case &be->be_node==list.
+ */
+ list_add_tail(&new->be_node, &be->be_node);
+ dprintk("%s: inserting new\n", __func__);
+ print_elist(list);
+ /* Scan forward for overlaps. If we find any, extend new and
+ * remove the overlapped extent.
+ */
+ be = list_prepare_entry(new, list, be_node);
+ list_for_each_entry_safe_continue(be, tmp, list, be_node) {
+ if (end < be->be_f_offset)
+ break;
+ /* new overlaps or abuts existing be */
+ if (extents_consistent(be, new)) {
+ if (end < be->be_f_offset + be->be_length) {
+ /* extend new to fully cover be */
+ end = be->be_f_offset + be->be_length;
+ new->be_length = end - new->be_f_offset;
+ }
+ dprintk("%s: removing %p\n", __func__, be);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ } else if (end != be->be_f_offset) {
+ list_del(&new->be_node);
+ goto out_err;
+ }
+ }
+ dprintk("%s: after merging\n", __func__);
+ print_elist(list);
+ /* STUB - The per-list consistency checks have all been done,
+ * should now check cross-list consistency.
+ */
+ return 0;
+
+ out_err:
+ bl_put_extent(new);
+ return -EIO;
+}
--
1.7.4.1
On 2011-07-25 13:25, Myklebust, Trond wrote:
>> -----Original Message-----
>> From: Benny Halevy [mailto:[email protected]]
>> Sent: Monday, July 25, 2011 10:50 AM
>> To: Myklebust, Trond
>> Cc: Jim Rees; [email protected]; peter honeyman
>> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
>> option, Makefile, and stubs
>>
>> On 2011-07-25 10:38, Myklebust, Trond wrote:
>>>> -----Original Message-----
>>>> From: Benny Halevy [mailto:[email protected]]
>>>> Sent: Monday, July 25, 2011 10:31 AM
>>>> To: Jim Rees
>>>> Cc: Myklebust, Trond; [email protected]; peter honeyman
>>>> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
>>>> option, Makefile, and stubs
>>>>
>>>> On 2011-07-21 15:34, Jim Rees wrote:
>>>>> From: Fred Isaman <[email protected]>
>>>>>
>>>>> Define a configuration variable to enable/disable compilation of
>> the
>>>>> block driver code.
>>>>>
>>>>> Add the minimal structure for a pnfs block layout driver, and
> empty
>>>>> list-heads that will hold the extent data
>>>>>
>>>>> [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> [pnfsblock: SQUASHME: port block layout code]
>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>> [pnfsblock: SQUASHME: adjust to API change]
>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> [blocklayout: encode_layoutcommit implementation]
>>>>> Signed-off-by: Boaz Harrosh <[email protected]>
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> [pnfsblock: layout alloc and free]
>>>>> Signed-off-by: Fred Isaman <[email protected]>
>>>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> Signed-off-by: Benny Halevy <[email protected]>
>>>>> [pnfsblock: define module alias]
>>>>> Signed-off-by: Peng Tao <[email protected]>
>>>>> ---
>>>>> fs/nfs/Kconfig | 8 ++-
>>>>> fs/nfs/Makefile | 1 +
>>>>> fs/nfs/blocklayout/Makefile | 5 +
>>>>> fs/nfs/blocklayout/blocklayout.c | 175
>>>> ++++++++++++++++++++++++++++++++++++++
>>>>> fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
>>>>> 5 files changed, 279 insertions(+), 1 deletions(-)
>>>>> create mode 100644 fs/nfs/blocklayout/Makefile
>>>>> create mode 100644 fs/nfs/blocklayout/blocklayout.c
>>>>> create mode 100644 fs/nfs/blocklayout/blocklayout.h
>>>>>
>>>>> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
>>>>> index 2cde5d9..be02077 100644
>>>>> --- a/fs/nfs/Kconfig
>>>>> +++ b/fs/nfs/Kconfig
>>>>> @@ -79,15 +79,21 @@ config NFS_V4_1
>>>>> depends on NFS_FS && NFS_V4 && EXPERIMENTAL
>>>>> select SUNRPC_BACKCHANNEL
>>>>> select PNFS_FILE_LAYOUT
>>>>> + select PNFS_BLOCK
>>>>> + select MD
>>>>> + select BLK_DEV_DM
>>>>
>>>> Why is PNFS_BLOCK enabled automatically in all cases?
>>>> That renders the use of modules for layout drivers totally useless.
>>>> I sort of understand that for PNFS_FILE_LAYOUT (when my
>>>> arm is twisted really hard behind my back :) since it
>>>> is an integral part of RFC5661 but what's the justification
>>>> for PNFS_BLOCK? and why blocks and not objects?
>>>
>>> The question is rather why did objects add a selectable compile
>> option?
>>
>> Just good citizenship :)
>>
>>> What is the point of not compiling a given layout driver if all the
>>> dependencies are met?
>>
>> Reducing build times...
>> Building a smaller kernel when modules are disabled...
>
>
> You can add a line with
> depends on m
>
> to ensure that it is always compiled as a module. I think that might be
> a good thing until we have nailed down all the issues with pNFS.
>
I'd rather leave it as is so it's easier to test without CONFIG_MODULES.
>> We're fine in terms of memory consumption when CONFIG_MODULES=y since
>> the
>> layout driver is loaded on demand, but shouldn't we be worried about
>> the other case?
>>
>>>
>>> IOW: The only thing I'd change above is the select MD and select
>>> BLK_DEV_DM: I'd prefer something like
>>>
>>> config PNFS_BLOCK
>>> depends on NFS_V4_1 && MD && BLK_DEV_DM
>>> default y
>>
>> This is closer to the original version.
>> However, selecting MD and BLK_DEV_DM was proven useful to
> automatically
>> take
>> care of the module dependencies without having to dive into details.
>
> Yes, but since the MD is a completely different layer that is not under
> our control (well, OK, Neil is still an NFS maintainer and an MD
> maintainer) then I'd prefer to leave it as a dependency.
>
> We can always add something like
>
> comment
> depends on NFS_V4_1 && !BLK_DEV_DM
> Please enable BLK_DEV_DM if you wish to enable the pNFS block
> driver.
I never knew you could enable comments conditionally this way...
It looks ok to me, I'll try it out and see how it shows in make *config
Benny
>
>
> Cheers
> Trond
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Fred Isaman <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: fix bug getting pnfs_layout_type in translate_devid().]
Signed-off-by: Tao Guo <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Zhang Jingwang <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 36 +++++++++++++++++++++++++++++-----
fs/nfs/blocklayout/blocklayout.h | 6 +++++
fs/nfs/blocklayout/blocklayoutdev.c | 8 +++++++
3 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3afe363..1858743 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -115,16 +115,40 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
return &bl->bl_layout;
}
-static void
-bl_free_lseg(struct pnfs_layout_segment *lseg)
+static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
+ dprintk("%s enter\n", __func__);
+ kfree(lseg);
}
-static struct pnfs_layout_segment *
-bl_alloc_lseg(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+/* We pretty much ignore lseg, and store all data layout wide, so we
+ * can correctly merge.
+ */
+static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
{
- return NULL;
+ struct pnfs_layout_segment *lseg;
+ int status;
+
+ dprintk("%s enter\n", __func__);
+ lseg = kzalloc(sizeof(*lseg), gfp_flags);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+ status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+ if (status) {
+ /* We don't want to call the full-blown bl_free_lseg,
+ * since on error extents were not touched.
+ */
+ /* STUB - we really want to distinguish between 2 error
+ * conditions here. This lseg failed, but lo data structures
+ * are OK, or we hosed the lo data structures. The calling
+ * code probably needs to distinguish this too.
+ */
+ kfree(lseg);
+ return ERR_PTR(status);
+ }
+ return lseg;
}
static void
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 08b4d6f..7dab978 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -96,6 +96,12 @@ static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
return container_of(lo, struct pnfs_block_layout, bl_layout);
}
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+ return BLK_LO2EXT(lseg->pls_layout);
+}
+
struct bl_dev_msg {
int status;
uint32_t major, minor;
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index 7e1377f..64da33a 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -189,3 +189,11 @@ out:
kfree(msg.data);
return rv;
}
+
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+ /* STUB */
+ return -EIO;
+}
--
1.7.4.1
On 2011-07-21 15:34, Jim Rees wrote:
> From: Fred Isaman <[email protected]>
>
> Signed-off-by: Fred Isaman <[email protected]>
> [pnfsblock: fix bug getting pnfs_layout_type in translate_devid().]
> Signed-off-by: Tao Guo <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Zhang Jingwang <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/blocklayout/blocklayout.c | 36 +++++++++++++++++++++++++++++-----
> fs/nfs/blocklayout/blocklayout.h | 6 +++++
> fs/nfs/blocklayout/blocklayoutdev.c | 8 +++++++
> 3 files changed, 44 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 3afe363..1858743 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -115,16 +115,40 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> return &bl->bl_layout;
> }
>
> -static void
> -bl_free_lseg(struct pnfs_layout_segment *lseg)
> +static void bl_free_lseg(struct pnfs_layout_segment *lseg)
> {
> + dprintk("%s enter\n", __func__);
> + kfree(lseg);
> }
>
> -static struct pnfs_layout_segment *
> -bl_alloc_lseg(struct pnfs_layout_hdr *lo,
> - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
> +/* We pretty much ignore lseg, and store all data layout wide, so we
> + * can correctly merge.
> + */
> +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
> + struct nfs4_layoutget_res *lgr,
> + gfp_t gfp_flags)
> {
> - return NULL;
> + struct pnfs_layout_segment *lseg;
> + int status;
> +
> + dprintk("%s enter\n", __func__);
> + lseg = kzalloc(sizeof(*lseg), gfp_flags);
> + if (!lseg)
> + return ERR_PTR(-ENOMEM);
> + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
> + if (status) {
> + /* We don't want to call the full-blown bl_free_lseg,
> + * since on error extents were not touched.
> + */
> + /* STUB - we really want to distinguish between 2 error
Please use "FIXME:" or "TODO:" to label further work that needs to be done.
The latter part of the second comment actually contradicts the first one,
if I understand it correctly. Is the code exposed to the error case where the lo
data structure is hosed? When does it happen, and what is the risk?
Benny
> + * conditions here. This lseg failed, but lo data structures
> + * are OK, or we hosed the lo data structures. The calling
> + * code probably needs to distinguish this too.
> + */
> + kfree(lseg);
> + return ERR_PTR(status);
> + }
> + return lseg;
> }
>
> static void
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index 08b4d6f..7dab978 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -96,6 +96,12 @@ static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
> return container_of(lo, struct pnfs_block_layout, bl_layout);
> }
>
> +static inline struct pnfs_block_layout *
> +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
> +{
> + return BLK_LO2EXT(lseg->pls_layout);
> +}
> +
> struct bl_dev_msg {
> int status;
> uint32_t major, minor;
> diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
> index 7e1377f..64da33a 100644
> --- a/fs/nfs/blocklayout/blocklayoutdev.c
> +++ b/fs/nfs/blocklayout/blocklayoutdev.c
> @@ -189,3 +189,11 @@ out:
> kfree(msg.data);
> return rv;
> }
> +
> +int
> +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
> + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
> +{
> + /* STUB */
> + return -EIO;
> +}
From: Fred Isaman <[email protected]>
Note: When the upper layer's read/write request cannot be fulfilled, the block
layout driver shouldn't silently mark the page as an error. It should do
what can be done and leave the rest to the upper layer. To do so, we
should set rdata/wdata->res.count properly.
When the upper layer re-sends the read/write request to finish the remaining
part of the request, pgbase is the position at which we should start.
[pnfsblock: mark IO error with NFS_LAYOUT_{RW|RO}_FAILED]
Signed-off-by: Peng Tao <[email protected]>
[pnfsblock: read path error handling]
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: handle errors when read or write pagelist.]
Signed-off-by: Zhang Jingwang <[email protected]>
[pnfs-block: use new read_pagelist api]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 282 ++++++++++++++++++++++++++++++++++++++
1 files changed, 282 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 300a678..764096c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -29,10 +29,12 @@
* of the software, even if it has been or is hereafter advised of the
* possibility of such damages.
*/
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/bio.h> /* struct bio */
#include "blocklayout.h"
@@ -45,9 +47,289 @@ MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
struct dentry *bl_device_pipe;
wait_queue_head_t bl_wq;
+static void print_page(struct page *page)
+{
+ dprintk("PRINTPAGE page %p\n", page);
+ dprintk(" PagePrivate %d\n", PagePrivate(page));
+ dprintk(" PageUptodate %d\n", PageUptodate(page));
+ dprintk(" PageError %d\n", PageError(page));
+ dprintk(" PageDirty %d\n", PageDirty(page));
+ dprintk(" PageReferenced %d\n", PageReferenced(page));
+ dprintk(" PageLocked %d\n", PageLocked(page));
+ dprintk(" PageWriteback %d\n", PageWriteback(page));
+ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
+ dprintk("\n");
+}
+
+/* Given the be associated with isect, determine if page data needs to be
+ * initialized.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+ if (be->be_state == PNFS_BLOCK_NONE_DATA)
+ return 1;
+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+ return 0;
+ else
+ return !is_sector_initialized(be->be_inval, isect);
+}
+
+static int
+dont_like_caller(struct nfs_page *req)
+{
+ if (atomic_read(&req->wb_complete)) {
+ /* Called by _multi */
+ return 1;
+ } else {
+ /* Called by _one */
+ return 0;
+ }
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+ struct kref refcnt;
+ struct rpc_call_ops call_ops;
+ void (*pnfs_callback) (void *data);
+ void *data;
+};
+
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+ struct parallel_io *rv;
+
+ rv = kmalloc(sizeof(*rv), GFP_NOFS);
+ if (rv) {
+ rv->data = data;
+ kref_init(&rv->refcnt);
+ }
+ return rv;
+}
+
+static inline void get_parallel(struct parallel_io *p)
+{
+ kref_get(&p->refcnt);
+}
+
+static void destroy_parallel(struct kref *kref)
+{
+ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+ dprintk("%s enter\n", __func__);
+ p->pnfs_callback(p->data);
+ kfree(p);
+}
+
+static inline void put_parallel(struct parallel_io *p)
+{
+ kref_put(&p->refcnt, destroy_parallel);
+}
+
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+ if (bio) {
+ get_parallel(bio->bi_private);
+ dprintk("%s submitting %s bio %u@%llu\n", __func__,
+ rw == READ ? "read" : "write",
+ bio->bi_size, (unsigned long long)bio->bi_sector);
+ submit_bio(rw, bio);
+ }
+ return NULL;
+}
+
+static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_NOIO, npg);
+ if (!bio)
+ return NULL;
+
+ bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_bdev = be->be_mdev;
+ bio->bi_end_io = end_io;
+ bio->bi_private = par;
+ return bio;
+}
+
+static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
+ sector_t isect, struct page *page,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par)
+{
+retry:
+ if (!bio) {
+ bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+ }
+ if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ bio = bl_submit_bio(rw, bio);
+ goto retry;
+ }
+ return bio;
+}
+
+static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+ } else {
+ dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+ set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+ }
+}
+
+/* This is basically copied from mpage_end_io_read */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+ struct parallel_io *par = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (uptodate)
+ SetPageUptodate(page);
+ } while (bvec >= bio->bi_io_vec);
+ if (!uptodate) {
+ if (!rdata->pnfs_error)
+ rdata->pnfs_error = -EIO;
+ bl_set_lo_fail(rdata->lseg);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
+static void bl_read_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_read_data *rdata;
+ dprintk("%s enter\n", __func__);
+ task = container_of(work, struct rpc_task, u.tk_work);
+ rdata = container_of(task, struct nfs_read_data, task);
+ pnfs_ld_read_done(rdata);
+}
+
+static void
+bl_end_par_io_read(void *data)
+{
+ struct nfs_read_data *rdata = data;
+
+ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+ schedule_work(&rdata->task.u.tk_work);
+}
+
+/* We don't want normal .rpc_call_done callback used, so we replace it
+ * with this stub.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+ return;
+}
+
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
+ int i, hole;
+ struct bio *bio = NULL;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect, extent_length = 0;
+ struct parallel_io *par;
+ loff_t f_offset = rdata->args.offset;
+ size_t count = rdata->args.count;
+ struct page **pages = rdata->args.pages;
+ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+ rdata->npages, f_offset, count);
+
+ if (dont_like_caller(rdata->req)) {
+ dprintk("%s dont_like_caller failed\n", __func__);
+ goto use_mds;
+ }
+
+ par = alloc_parallel(rdata);
+ if (!par)
+ goto use_mds;
+ par->call_ops = *rdata->mds_ops;
+ par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+ par->pnfs_callback = bl_end_par_io_read;
+ /* At this point, we can no longer jump to use_mds */
+
+ isect = (sector_t) (f_offset >> SECTOR_SHIFT);
+ /* Code assumes extents are page-aligned */
+ for (i = pg_index; i < rdata->npages; i++) {
+ if (!extent_length) {
+ /* We've used up the previous extent */
+ bl_put_extent(be);
+ bl_put_extent(cow_read);
+ bio = bl_submit_bio(READ, bio);
+ /* Get the next one */
+ be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
+ isect, &cow_read);
+ if (!be) {
+ rdata->pnfs_error = -EIO;
+ goto out;
+ }
+ extent_length = be->be_length -
+ (isect - be->be_f_offset);
+ if (cow_read) {
+ sector_t cow_length = cow_read->be_length -
+ (isect - cow_read->be_f_offset);
+ extent_length = min(extent_length, cow_length);
+ }
+ }
+ hole = is_hole(be, isect);
+ if (hole && !cow_read) {
+ bio = bl_submit_bio(READ, bio);
+ /* Fill hole w/ zeroes w/o accessing device */
+ dprintk("%s Zeroing page for hole\n", __func__);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ print_page(pages[i]);
+ SetPageUptodate(pages[i]);
+ } else {
+ struct pnfs_block_extent *be_read;
+
+ be_read = (hole && cow_read) ? cow_read : be;
+ bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
+ isect, pages[i], be_read,
+ bl_end_io_read, par);
+ if (IS_ERR(bio)) {
+ rdata->pnfs_error = PTR_ERR(bio);
+ goto out;
+ }
+ }
+ isect += PAGE_CACHE_SECTORS;
+ extent_length -= PAGE_CACHE_SECTORS;
+ }
+ if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
+ rdata->res.eof = 1;
+ rdata->res.count = rdata->inode->i_size - f_offset;
+ } else {
+ rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
+ }
+out:
+ bl_put_extent(be);
+ bl_put_extent(cow_read);
+ bl_submit_bio(READ, bio);
+ put_parallel(par);
+ return PNFS_ATTEMPTED;
+
+ use_mds:
+ dprintk("Giving up and using normal NFS\n");
return PNFS_NOT_ATTEMPTED;
}
--
1.7.4.1
From: Peng Tao <[email protected]>
For invalid extents, find other pages in the same fsblock and write them out.
[pnfsblock: write_begin]
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 275 ++++++++++++++++++++++++++++++++------
1 files changed, 233 insertions(+), 42 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 190b61f..f104a66 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,6 +35,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h> /* struct bio */
+#include <linux/buffer_head.h> /* various write calls */
#include "blocklayout.h"
@@ -79,12 +80,8 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
*/
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
- return 1;
- else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
- return 0;
- else
- return is_sector_initialized(be->be_inval, isect);
+ return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+ be->be_state == PNFS_BLOCK_INVALID_DATA);
}
static int
@@ -370,6 +367,31 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
}
}
+static void bl_end_io_write_zero(struct bio *bio, int err)
+{
+ struct parallel_io *par = bio->bi_private;
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ /* This is the zeroing page we added */
+ end_page_writeback(page);
+ page_cache_release(page);
+ } while (bvec >= bio->bi_io_vec);
+ if (!uptodate) {
+ if (!wdata->pnfs_error)
+ wdata->pnfs_error = -EIO;
+ bl_set_lo_fail(wdata->lseg);
+ }
+ bio_put(bio);
+ put_parallel(par);
+}
+
/* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err)
{
@@ -396,11 +418,8 @@ static void bl_write_cleanup(struct work_struct *work)
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task);
- if (!wdata->task.tk_status) {
+ if (!wdata->pnfs_error) {
/* Marks for LAYOUTCOMMIT */
- /* BUG - this should be called after each bio, not after
- * all finish, unless have some way of storing success/failure
- */
mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
wdata->args.offset, wdata->args.count);
}
@@ -408,30 +427,105 @@ static void bl_write_cleanup(struct work_struct *work)
}
/* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void
-bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data)
{
struct nfs_write_data *wdata = data;
- /* STUB - ignoring error handling */
wdata->task.tk_status = 0;
wdata->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
schedule_work(&wdata->task.u.tk_work);
}
+/* STUB - mark intersection of layout and page as bad, so it is not
+ * used again.
+ */
+static void mark_bad_read(void)
+{
+ return;
+}
+
+/*
+ * map_block: map a requested I/O block (isect) into an offset in the LVM
+ * block_device
+ */
+static void
+map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
+{
+ dprintk("%s enter be=%p\n", __func__, be);
+
+ set_buffer_mapped(bh);
+ bh->b_bdev = be->be_mdev;
+ bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+ (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
+
+ dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
+ __func__, (unsigned long long)isect, (long)bh->b_blocknr,
+ bh->b_size);
+ return;
+}
+
+/* Given an unmapped page, zero it or read in page for COW, page is locked
+ * by caller.
+ */
+static int
+init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
+{
+ struct buffer_head *bh = NULL;
+ int ret = 0;
+ sector_t isect;
+
+ dprintk("%s enter, %p\n", __func__, page);
+ BUG_ON(PageUptodate(page));
+ if (!cow_read) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ goto cleanup;
+ }
+
+ bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+ if (!bh) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
+ map_block(bh, isect, cow_read);
+ if (!bh_uptodate_or_lock(bh))
+ ret = bh_submit_read(bh);
+ if (ret)
+ goto cleanup;
+ SetPageUptodate(page);
+
+cleanup:
+ bl_put_extent(cow_read);
+ if (bh)
+ free_buffer_head(bh);
+ if (ret) {
+ /* Need to mark layout with bad read...should now
+ * just use nfs4 for reads and writes.
+ */
+ mark_bad_read();
+ }
+ return ret;
+}
+
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
- int i;
+ int i, ret, npg_zero, pg_index, last = 0;
struct bio *bio = NULL;
- struct pnfs_block_extent *be = NULL;
- sector_t isect, extent_length = 0;
+ struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+ sector_t isect, last_isect = 0, extent_length = 0;
struct parallel_io *par;
loff_t offset = wdata->args.offset;
size_t count = wdata->args.count;
struct page **pages = wdata->args.pages;
- int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ struct page *page;
+ pgoff_t index;
+ u64 temp;
+ int npg_per_block =
+ NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
if (!wdata->lseg) {
@@ -443,11 +537,8 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
return PNFS_NOT_ATTEMPTED;
}
/* At this point, wdata->pages is a (sequential) list of nfs_pages.
- * We want to write each, and if there is an error remove it from
- * list and call
- * nfs_retry_request(req) to have it redone using nfs.
- * QUEST? Do as block or per req? Think have to do per block
- * as part of end_bio
+ * We want to write each, and if there is an error set pnfs_error
+ * to have it redone using nfs.
*/
par = alloc_parallel(wdata);
if (!par)
@@ -458,7 +549,91 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
/* At this point, have to be more careful with error handling */
isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
- for (i = pg_index; i < wdata->npages ; i++) {
+ be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
+ if (!be || !is_writable(be, isect)) {
+ dprintk("%s no matching extents!\n", __func__);
+ wdata->pnfs_error = -EINVAL;
+ goto out;
+ }
+
+ /* First page inside INVALID extent */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ temp = offset >> PAGE_CACHE_SHIFT;
+ npg_zero = do_div(temp, npg_per_block);
+ isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
+ (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+ extent_length = be->be_length - (isect - be->be_f_offset);
+
+fill_invalid_ext:
+ dprintk("%s need to zero %d pages\n", __func__, npg_zero);
+ for (;npg_zero > 0; npg_zero--) {
+ /* page ref released in bl_end_io_write_zero */
+ index = isect >> PAGE_CACHE_SECTOR_SHIFT;
+ dprintk("%s zero %dth page: index %lu isect %llu\n",
+ __func__, npg_zero, index,
+ (unsigned long long)isect);
+ page =
+ find_or_create_page(wdata->inode->i_mapping, index,
+ GFP_NOFS);
+ if (!page) {
+ dprintk("%s oom\n", __func__);
+ wdata->pnfs_error = -ENOMEM;
+ goto out;
+ }
+
+ /* PageDirty: Other will write this out
+ * PageWriteback: Other is writing this out
+ * PageUptodate: It was read before
+ * sector_initialized: already written out
+ */
+ if (PageDirty(page) || PageWriteback(page) ||
+ is_sector_initialized(be->be_inval, isect)) {
+ print_page(page);
+ unlock_page(page);
+ page_cache_release(page);
+ goto next_page;
+ }
+ if (!PageUptodate(page)) {
+ /* New page, readin or zero it */
+ init_page_for_write(page, cow_read);
+ }
+ set_page_writeback(page);
+ unlock_page(page);
+
+ ret = bl_mark_sectors_init(be->be_inval, isect,
+ PAGE_CACHE_SECTORS,
+ NULL);
+ if (unlikely(ret)) {
+ dprintk("%s bl_mark_sectors_init fail %d\n",
+ __func__, ret);
+ end_page_writeback(page);
+ page_cache_release(page);
+ wdata->pnfs_error = ret;
+ goto out;
+ }
+ bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
+ isect, page, be,
+ bl_end_io_write_zero, par);
+ if (IS_ERR(bio)) {
+ wdata->pnfs_error = PTR_ERR(bio);
+ goto out;
+ }
+ /* FIXME: This should be done in bi_end_io */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE);
+next_page:
+ isect += PAGE_CACHE_SECTORS;
+ extent_length -= PAGE_CACHE_SECTORS;
+ }
+ if (last)
+ goto write_done;
+ }
+ bio = bl_submit_bio(WRITE, bio);
+
+ /* Middle pages */
+ pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+ for (i = pg_index; i < wdata->npages; i++) {
if (!extent_length) {
/* We've used up the previous extent */
bl_put_extent(be);
@@ -467,35 +642,51 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
isect, NULL);
if (!be || !is_writable(be, isect)) {
- wdata->pnfs_error = -ENOMEM;
+ wdata->pnfs_error = -EINVAL;
goto out;
}
extent_length = be->be_length -
- (isect - be->be_f_offset);
+ (isect - be->be_f_offset);
}
- for (;;) {
- if (!bio) {
- bio = bio_alloc(GFP_NOIO, wdata->npages - i);
- if (!bio) {
- wdata->pnfs_error = -ENOMEM;
- goto out;
- }
- bio->bi_sector = isect - be->be_f_offset +
- be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
- bio->bi_end_io = bl_end_io_write;
- bio->bi_private = par;
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ ret = bl_mark_sectors_init(be->be_inval, isect,
+ PAGE_CACHE_SECTORS,
+ NULL);
+ if (unlikely(ret)) {
+ dprintk("%s bl_mark_sectors_init fail %d\n",
+ __func__, ret);
+ wdata->pnfs_error = ret;
+ goto out;
}
- if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
- break;
- bio = bl_submit_bio(WRITE, bio);
+ }
+ bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
+ isect, pages[i], be,
+ bl_end_io_write, par);
+ if (IS_ERR(bio)) {
+ wdata->pnfs_error = PTR_ERR(bio);
+ goto out;
}
isect += PAGE_CACHE_SECTORS;
+ last_isect = isect;
extent_length -= PAGE_CACHE_SECTORS;
}
- wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
- if (count < wdata->res.count)
+
+ /* Last page inside INVALID extent */
+ if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+ bio = bl_submit_bio(WRITE, bio);
+ temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
+ npg_zero = npg_per_block - do_div(temp, npg_per_block);
+ if (npg_zero < npg_per_block) {
+ last = 1;
+ goto fill_invalid_ext;
+ }
+ }
+
+write_done:
+ wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
+ if (count < wdata->res.count) {
wdata->res.count = count;
+ }
out:
bl_put_extent(be);
bl_submit_bio(WRITE, bio);
--
1.7.4.1
On 2011-07-25 10:38, Myklebust, Trond wrote:
>> -----Original Message-----
>> From: Benny Halevy [mailto:[email protected]]
>> Sent: Monday, July 25, 2011 10:31 AM
>> To: Jim Rees
>> Cc: Myklebust, Trond; [email protected]; peter honeyman
>> Subject: Re: [PATCH v2 07/25] pnfsblock: add blocklayout Kconfig
>> option, Makefile, and stubs
>>
>> On 2011-07-21 15:34, Jim Rees wrote:
>>> From: Fred Isaman <[email protected]>
>>>
>>> Define a configuration variable to enable/disable compilation of the
>>> block driver code.
>>>
>>> Add the minimal structure for a pnfs block layout driver, and empty
>>> list-heads that will hold the extent data
>>>
>>> [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
>>> Signed-off-by: Peng Tao <[email protected]>
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> [pnfsblock: SQUASHME: port block layout code]
>>> Signed-off-by: Peng Tao <[email protected]>
>>> [pnfsblock: SQUASHME: adjust to API change]
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> [blocklayout: encode_layoutcommit implementation]
>>> Signed-off-by: Boaz Harrosh <[email protected]>
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> [pnfsblock: layout alloc and free]
>>> Signed-off-by: Fred Isaman <[email protected]>
>>> [pnfs: move pnfs_layout_type inline in nfs_inode]
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> Signed-off-by: Benny Halevy <[email protected]>
>>> [pnfsblock: define module alias]
>>> Signed-off-by: Peng Tao <[email protected]>
>>> ---
>>> fs/nfs/Kconfig | 8 ++-
>>> fs/nfs/Makefile | 1 +
>>> fs/nfs/blocklayout/Makefile | 5 +
>>> fs/nfs/blocklayout/blocklayout.c | 175
>> ++++++++++++++++++++++++++++++++++++++
>>> fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
>>> 5 files changed, 279 insertions(+), 1 deletions(-)
>>> create mode 100644 fs/nfs/blocklayout/Makefile
>>> create mode 100644 fs/nfs/blocklayout/blocklayout.c
>>> create mode 100644 fs/nfs/blocklayout/blocklayout.h
>>>
>>> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
>>> index 2cde5d9..be02077 100644
>>> --- a/fs/nfs/Kconfig
>>> +++ b/fs/nfs/Kconfig
>>> @@ -79,15 +79,21 @@ config NFS_V4_1
>>> depends on NFS_FS && NFS_V4 && EXPERIMENTAL
>>> select SUNRPC_BACKCHANNEL
>>> select PNFS_FILE_LAYOUT
>>> + select PNFS_BLOCK
>>> + select MD
>>> + select BLK_DEV_DM
>>
>> Why is PNFS_BLOCK enabled automatically in all cases?
>> That renders the use of modules for layout drivers totally useless.
>> I sort of understand that for PNFS_FILE_LAYOUT (when my
>> arm is twisted really hard behind my back :) since it
>> is an integral part of RFC5661 but what's the justification
>> for PNFS_BLOCK? and why blocks and not objects?
>
> The question is rather why did objects add a selectable compile option?
Just good citizenship :)
> What is the point of not compiling a given layout driver if all the
> dependencies are met?
Reducing build times...
Building a smaller kernel when modules are disabled...
We're fine in terms of memory consumption when CONFIG_MODULES=y, since the
layout driver is loaded on demand, but shouldn't we be worried about
the other case?
>
> IOW: The only thing I'd change above is the select MD and select
> BLK_DEV_DM: I'd prefer something like
>
> config PNFS_BLOCK
> depends on NFS_V4_1 && MD && BLK_DEV_DM
> default y
This is closer to the original version.
However, selecting MD and BLK_DEV_DM has proven useful for automatically taking
care of the module dependencies without having to dive into details.
Benny
>
> Trond
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Peng Tao <[email protected]>
There is no need to keep the lseg reference when reading/writing through the MDS.
This fixes a null pointer crash in nfs_post_op_update_inode_force_wcc, which
occurs because nfs4_proc_write_setup will unset wdata->res.fattr if wdata->lseg
is not NULL.
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/pnfs.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3b20753..fda3019 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1182,6 +1182,9 @@ pnfs_ld_write_done(struct nfs_write_data *data)
dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
data->pnfs_error);
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
data->mds_ops, NFS_FILE_SYNC);
return status ? : -EAGAIN;
@@ -1282,6 +1285,9 @@ pnfs_ld_read_done(struct nfs_read_data *data)
dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
data->pnfs_error);
+
+ put_lseg(data->lseg);
+ data->lseg = NULL;
status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
data->mds_ops);
return status ? : -EAGAIN;
--
1.7.4.1
From: Fred Isaman <[email protected]>
Note: When the upper layer's read/write request cannot be fulfilled, the block
layout driver shouldn't silently mark the page as an error. It should do
what can be done and leave the rest to the upper layer. To do so, we
should set rdata/wdata->res.count properly.
When the upper layer re-sends the read/write request to finish the remaining
part of the request, pgbase is the position at which we should start.
[pnfsblock: bl_write_pagelist support functions]
[pnfsblock: bl_write_pagelist adjust for missing PG_USE_PNFS]
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: handle errors when read or write pagelist.]
Signed-off-by: Zhang Jingwang <[email protected]>
[pnfs-block: use new write_pagelist api]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[SQUASHME: pnfsblock: mds_offset is set in the generic layer]
Signed-off-by: Boaz Harrosh <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: mark IO error with NFS_LAYOUT_{RW|RO}_FAILED]
Signed-off-by: Peng Tao <[email protected]>
[pnfsblock: SQUASHME: adjust to API change]
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: fixup blksize alignment in bl_setup_layoutcommit]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[pnfsblock: bl_write_pagelist adjust for missing PG_USE_PNFS]
Signed-off-by: Fred Isaman <[email protected]>
[pnfsblock: handle errors when read or write pagelist.]
Signed-off-by: Zhang Jingwang <[email protected]>
[pnfs-block: use new write_pagelist api]
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 137 +++++++++++++++++++++++++++++++++++++-
1 files changed, 134 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 764096c..6d6ac0e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -74,6 +74,19 @@ static int is_hole(struct pnfs_block_extent *be, sector_t isect)
return !is_sector_initialized(be->be_inval, isect);
}
+/* Given the be associated with isect, determine if page data can be
+ * written to disk.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+ if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
+ return 1;
+ else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+ return 0;
+ else
+ return is_sector_initialized(be->be_inval, isect);
+}
+
static int
dont_like_caller(struct nfs_page *req)
{
@@ -333,11 +346,129 @@ out:
return PNFS_NOT_ATTEMPTED;
}
+/* This is basically copied from mpage_end_io_read */
+/* Completion handler for the write bios built by bl_write_pagelist.
+ * A bio that is not BIO_UPTODATE stores -EIO in the write data (unless
+ * an error is already stored there) and calls bl_set_lo_fail() on the
+ * layout segment.  In every case the bio and this bio's hold on the
+ * parallel_io are released.
+ */
+static void bl_end_io_write(struct bio *bio, int err)
+{
+	struct parallel_io *par = bio->bi_private;
+	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		goto release;
+
+	/* Keep the first error recorded for this request */
+	if (wdata->pnfs_error == 0)
+		wdata->pnfs_error = -EIO;
+	bl_set_lo_fail(wdata->lseg);
+release:
+	bio_put(bio);
+	put_parallel(par);
+}
+
+/* Work function queued by bl_end_par_io_write.
+ * It recovers the nfs_write_data that embeds the work item and hands it
+ * to pnfs_ld_write_done().  Marking sectors as written / extending the
+ * commit list is not done here in this version.
+ */
+static void bl_write_cleanup(struct work_struct *work)
+{
+ struct rpc_task *task;
+ struct nfs_write_data *wdata;
+ dprintk("%s enter\n", __func__);
+ /* work is u.tk_work of the rpc_task, which is the task member of wdata */
+ task = container_of(work, struct rpc_task, u.tk_work);
+ wdata = container_of(task, struct nfs_write_data, task);
+ pnfs_ld_write_done(wdata);
+}
+
+/* Called when last of bios associated with a bl_write_pagelist call finishes.
+ *
+ * data is the nfs_write_data that was passed to alloc_parallel().  The
+ * status reported through the embedded rpc_task is the error recorded in
+ * wdata->pnfs_error (set by bl_end_io_write or bl_write_pagelist), or 0
+ * when no error was recorded.  The remaining completion work is handed to
+ * bl_write_cleanup through schedule_work().
+ */
+static void
+bl_end_par_io_write(void *data)
+{
+	struct nfs_write_data *wdata = data;
+
+	/* Do not report success for a request whose I/O failed */
+	wdata->task.tk_status = wdata->pnfs_error;
+	wdata->verf.committed = NFS_FILE_SYNC;
+	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
+	schedule_work(&wdata->task.u.tk_work);
+}
+
+/* Write the pages of wdata to the block device(s) behind the layout,
+ * building bios extent by extent.
+ *
+ * Returns PNFS_NOT_ATTEMPTED when nothing was started: no lseg, the
+ * request is rejected by dont_like_caller(), or alloc_parallel() failed.
+ * Otherwise returns PNFS_ATTEMPTED; a failure after that point is stored
+ * in wdata->pnfs_error (-EINVAL when no writable extent covers a page,
+ * -ENOMEM when no bio can be allocated) and completion is reported via
+ * bl_end_par_io_write when the last hold on the parallel_io is dropped.
+ * sync is not used here.
+ */
static enum pnfs_try_status
-bl_write_pagelist(struct nfs_write_data *wdata,
- int sync)
+bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
- return PNFS_NOT_ATTEMPTED;
+	int i;
+	struct bio *bio = NULL;
+	struct pnfs_block_extent *be = NULL;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par;
+	loff_t offset = wdata->args.offset;
+	size_t count = wdata->args.count;
+	struct page **pages = wdata->args.pages;
+	int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
+	if (!wdata->lseg) {
+		dprintk("%s no lseg, falling back to MDS\n", __func__);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	if (dont_like_caller(wdata->req)) {
+		dprintk("%s dont_like_caller failed\n", __func__);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
+	 * We want to write each, and if there is an error remove it from
+	 * list and call
+	 * nfs_retry_request(req) to have it redone using nfs.
+	 * QUEST? Do as block or per req?  Think have to do per block
+	 * as part of end_bio
+	 */
+	par = alloc_parallel(wdata);
+	if (!par)
+		return PNFS_NOT_ATTEMPTED;
+	par->call_ops = *wdata->mds_ops;
+	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+	par->pnfs_callback = bl_end_par_io_write;
+	/* At this point, have to be more careful with error handling */
+
+	/* First sector of the page containing offset */
+	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
+	for (i = pg_index; i < wdata->npages ; i++) {
+		if (!extent_length) {
+			/* We've used up the previous extent */
+			bl_put_extent(be);
+			bio = bl_submit_bio(WRITE, bio);
+			/* Get the next one */
+			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
+						isect, NULL);
+			if (!be || !is_writable(be, isect)) {
+				/* Not an allocation failure: the layout has
+				 * no writable extent for this sector.
+				 */
+				wdata->pnfs_error = -EINVAL;
+				goto out;
+			}
+			extent_length = be->be_length -
+				(isect - be->be_f_offset);
+		}
+		for (;;) {
+			if (!bio) {
+				bio = bio_alloc(GFP_NOIO, wdata->npages - i);
+				if (!bio) {
+					wdata->pnfs_error = -ENOMEM;
+					goto out;
+				}
+				/* Translate file sector to volume sector */
+				bio->bi_sector = isect - be->be_f_offset +
+					be->be_v_offset;
+				bio->bi_bdev = be->be_mdev;
+				bio->bi_end_io = bl_end_io_write;
+				bio->bi_private = par;
+			}
+			if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
+				break;
+			/* bio is full: submit it and retry with a new one */
+			bio = bl_submit_bio(WRITE, bio);
+		}
+		isect += PAGE_CACHE_SECTORS;
+		extent_length -= PAGE_CACHE_SECTORS;
+	}
+	wdata->res.count = (isect << SECTOR_SHIFT) - (offset);
+	if (count < wdata->res.count)
+		wdata->res.count = count;
+out:
+	bl_put_extent(be);
+	bl_submit_bio(WRITE, bio);
+	/* Drop the hold taken by alloc_parallel() */
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
}
/* FIXME - range ignored */
--
1.7.4.1
On 2011-07-21 15:34, Jim Rees wrote:
> From: Fred Isaman <[email protected]>
>
> Define a configuration variable to enable/disable compilation of the
> block driver code.
>
> Add the minimal structure for a pnfs block layout driver, and empty
> list-heads that will hold the extent data
>
> [pnfsblock: make NFS_V4_1 select PNFS_BLOCK]
> Signed-off-by: Peng Tao <[email protected]>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> [pnfs-block: fix CONFIG_PNFS_BLOCK dependencies]
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> [pnfsblock: SQUASHME: port block layout code]
> Signed-off-by: Peng Tao <[email protected]>
> [pnfsblock: SQUASHME: adjust to API change]
> Signed-off-by: Fred Isaman <[email protected]>
> [pnfs: move pnfs_layout_type inline in nfs_inode]
> Signed-off-by: Benny Halevy <[email protected]>
> [blocklayout: encode_layoutcommit implementation]
> Signed-off-by: Boaz Harrosh <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> [pnfsblock: layout alloc and free]
> Signed-off-by: Fred Isaman <[email protected]>
> [pnfs: move pnfs_layout_type inline in nfs_inode]
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> [pnfsblock: define module alias]
> Signed-off-by: Peng Tao <[email protected]>
> ---
> fs/nfs/Kconfig | 8 ++-
> fs/nfs/Makefile | 1 +
> fs/nfs/blocklayout/Makefile | 5 +
> fs/nfs/blocklayout/blocklayout.c | 175 ++++++++++++++++++++++++++++++++++++++
> fs/nfs/blocklayout/blocklayout.h | 91 ++++++++++++++++++++
> 5 files changed, 279 insertions(+), 1 deletions(-)
> create mode 100644 fs/nfs/blocklayout/Makefile
> create mode 100644 fs/nfs/blocklayout/blocklayout.c
> create mode 100644 fs/nfs/blocklayout/blocklayout.h
>
> diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
> index 2cde5d9..be02077 100644
> --- a/fs/nfs/Kconfig
> +++ b/fs/nfs/Kconfig
> @@ -79,15 +79,21 @@ config NFS_V4_1
> depends on NFS_FS && NFS_V4 && EXPERIMENTAL
> select SUNRPC_BACKCHANNEL
> select PNFS_FILE_LAYOUT
> + select PNFS_BLOCK
> + select MD
> + select BLK_DEV_DM
Why is PNFS_BLOCK enabled automatically in all cases?
That renders the use of modules for layout drivers totally useless.
I sort of understand that for PNFS_FILE_LAYOUT (when my
arm is twisted really hard behind my back :) since it
is an integral part of RFC5661 but what's the justification
for PNFS_BLOCK? and why blocks and not objects?
Benny
> help
> This option enables support for minor version 1 of the NFSv4 protocol
> - (RFC 5661) in the kernel's NFS client.
> + (RFC 5661 and RFC 5663) in the kernel's NFS client.
>
> If unsure, say N.
>
> config PNFS_FILE_LAYOUT
> tristate
>
> +config PNFS_BLOCK
> + tristate
> +
> config PNFS_OBJLAYOUT
> tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
> depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
> diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
> index 6a34f7d..b58613d 100644
> --- a/fs/nfs/Makefile
> +++ b/fs/nfs/Makefile
> @@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
> nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
>
> obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
> +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
> diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
> new file mode 100644
> index 0000000..6bf49cd
> --- /dev/null
> +++ b/fs/nfs/blocklayout/Makefile
> @@ -0,0 +1,5 @@
> +#
> +# Makefile for the pNFS block layout driver kernel module
> +#
> +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
> +blocklayoutdriver-objs := blocklayout.o
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> new file mode 100644
> index 0000000..55a2a95
> --- /dev/null
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -0,0 +1,175 @@
> +/*
> + * linux/fs/nfs/blocklayout/blocklayout.c
> + *
> + * Module for the NFSv4.1 pNFS block layout driver.
> + *
> + * Copyright (c) 2006 The Regents of the University of Michigan.
> + * All rights reserved.
> + *
> + * Andy Adamson <[email protected]>
> + * Fred Isaman <[email protected]>
> + *
> + * permission is granted to use, copy, create derivative works and
> + * redistribute this software and such derivative works for any purpose,
> + * so long as the name of the university of michigan is not used in
> + * any advertising or publicity pertaining to the use or distribution
> + * of this software without specific, written prior authorization. if
> + * the above copyright notice or any other identification of the
> + * university of michigan is included in any copy of any portion of
> + * this software, then the disclaimer below must also be included.
> + *
> + * this software is provided as is, without representation from the
> + * university of michigan as to its fitness for any purpose, and without
> + * warranty by the university of michigan of any kind, either express
> + * or implied, including without limitation the implied warranties of
> + * merchantability and fitness for a particular purpose. the regents
> + * of the university of michigan shall not be liable for any damages,
> + * including special, indirect, incidental, or consequential damages,
> + * with respect to any claim arising out or in connection with the use
> + * of the software, even if it has been or is hereafter advised of the
> + * possibility of such damages.
> + */
> +#include <linux/module.h>
> +#include <linux/init.h>
> +
> +#include "blocklayout.h"
> +
> +#define NFSDBG_FACILITY NFSDBG_PNFS_LD
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Andy Adamson <[email protected]>");
> +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
> +
> +static enum pnfs_try_status
> +bl_read_pagelist(struct nfs_read_data *rdata)
> +{
> + return PNFS_NOT_ATTEMPTED;
> +}
> +
> +static enum pnfs_try_status
> +bl_write_pagelist(struct nfs_write_data *wdata,
> + int sync)
> +{
> + return PNFS_NOT_ATTEMPTED;
> +}
> +
> +/* STUB */
> +static void
> +release_extents(struct pnfs_block_layout *bl,
> + struct pnfs_layout_range *range)
> +{
> + return;
> +}
> +
> +/* STUB */
> +static void
> +release_inval_marks(struct pnfs_inval_markings *marks)
> +{
> + return;
> +}
> +
> +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
> +{
> + struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
> +
> + dprintk("%s enter\n", __func__);
> + release_extents(bl, NULL);
> + release_inval_marks(&bl->bl_inval);
> + kfree(bl);
> +}
> +
> +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
> + gfp_t gfp_flags)
> +{
> + struct pnfs_block_layout *bl;
> +
> + dprintk("%s enter\n", __func__);
> + bl = kzalloc(sizeof(*bl), gfp_flags);
> + if (!bl)
> + return NULL;
> + spin_lock_init(&bl->bl_ext_lock);
> + INIT_LIST_HEAD(&bl->bl_extents[0]);
> + INIT_LIST_HEAD(&bl->bl_extents[1]);
> + INIT_LIST_HEAD(&bl->bl_commit);
> + INIT_LIST_HEAD(&bl->bl_committing);
> + bl->bl_count = 0;
> + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
> + INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
> + return &bl->bl_layout;
> +}
> +
> +static void
> +bl_free_lseg(struct pnfs_layout_segment *lseg)
> +{
> +}
> +
> +static struct pnfs_layout_segment *
> +bl_alloc_lseg(struct pnfs_layout_hdr *lo,
> + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
> +{
> + return NULL;
> +}
> +
> +static void
> +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
> + const struct nfs4_layoutcommit_args *arg)
> +{
> +}
> +
> +static void
> +bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
> + struct nfs4_layoutcommit_data *lcdata)
> +{
> +}
> +
> +static int
> +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
> +{
> + dprintk("%s enter\n", __func__);
> + return 0;
> +}
> +
> +static int
> +bl_clear_layoutdriver(struct nfs_server *server)
> +{
> + dprintk("%s enter\n", __func__);
> + return 0;
> +}
> +
> +static struct pnfs_layoutdriver_type blocklayout_type = {
> + .id = LAYOUT_BLOCK_VOLUME,
> + .name = "LAYOUT_BLOCK_VOLUME",
> + .read_pagelist = bl_read_pagelist,
> + .write_pagelist = bl_write_pagelist,
> + .alloc_layout_hdr = bl_alloc_layout_hdr,
> + .free_layout_hdr = bl_free_layout_hdr,
> + .alloc_lseg = bl_alloc_lseg,
> + .free_lseg = bl_free_lseg,
> + .encode_layoutcommit = bl_encode_layoutcommit,
> + .cleanup_layoutcommit = bl_cleanup_layoutcommit,
> + .set_layoutdriver = bl_set_layoutdriver,
> + .clear_layoutdriver = bl_clear_layoutdriver,
> +};
> +
> +static int __init nfs4blocklayout_init(void)
> +{
> + int ret;
> +
> + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
> +
> + ret = pnfs_register_layoutdriver(&blocklayout_type);
> + return ret;
> +}
> +
> +static void __exit nfs4blocklayout_exit(void)
> +{
> + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
> + __func__);
> +
> + pnfs_unregister_layoutdriver(&blocklayout_type);
> +}
> +
> +MODULE_ALIAS("nfs-layouttype4-3");
> +
> +module_init(nfs4blocklayout_init);
> +module_exit(nfs4blocklayout_exit);
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> new file mode 100644
> index 0000000..bda87e0
> --- /dev/null
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -0,0 +1,91 @@
> +/*
> + * linux/fs/nfs/blocklayout/blocklayout.h
> + *
> + * Module for the NFSv4.1 pNFS block layout driver.
> + *
> + * Copyright (c) 2006 The Regents of the University of Michigan.
> + * All rights reserved.
> + *
> + * Andy Adamson <[email protected]>
> + * Fred Isaman <[email protected]>
> + *
> + * permission is granted to use, copy, create derivative works and
> + * redistribute this software and such derivative works for any purpose,
> + * so long as the name of the university of michigan is not used in
> + * any advertising or publicity pertaining to the use or distribution
> + * of this software without specific, written prior authorization. if
> + * the above copyright notice or any other identification of the
> + * university of michigan is included in any copy of any portion of
> + * this software, then the disclaimer below must also be included.
> + *
> + * this software is provided as is, without representation from the
> + * university of michigan as to its fitness for any purpose, and without
> + * warranty by the university of michigan of any kind, either express
> + * or implied, including without limitation the implied warranties of
> + * merchantability and fitness for a particular purpose. the regents
> + * of the university of michigan shall not be liable for any damages,
> + * including special, indirect, incidental, or consequential damages,
> + * with respect to any claim arising out or in connection with the use
> + * of the software, even if it has been or is hereafter advised of the
> + * possibility of such damages.
> + */
> +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
> +#define FS_NFS_NFS4BLOCKLAYOUT_H
> +
> +#include <linux/device-mapper.h>
> +#include <linux/nfs_fs.h>
> +#include "../pnfs.h"
> +
> +enum exstate4 {
> + PNFS_BLOCK_READWRITE_DATA = 0,
> + PNFS_BLOCK_READ_DATA = 1,
> + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
> + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
> +};
> +
> +struct pnfs_inval_markings {
> + /* STUB */
> +};
> +
> +/* sector_t fields are all in 512-byte sectors */
> +struct pnfs_block_extent {
> + struct kref be_refcnt;
> + struct list_head be_node; /* link into lseg list */
> + struct nfs4_deviceid be_devid; /* STUB - removable??? */
> + struct block_device *be_mdev;
> + sector_t be_f_offset; /* the starting offset in the file */
> + sector_t be_length; /* the size of the extent */
> + sector_t be_v_offset; /* the starting offset in the volume */
> + enum exstate4 be_state; /* the state of this extent */
> + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
> +};
> +
> +static inline void
> +INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
> +{
> + /* STUB */
> +}
> +
> +enum extentclass4 {
> + RW_EXTENT = 0, /* READWRTE and INVAL */
> + RO_EXTENT = 1, /* READ and NONE */
> + EXTENT_LISTS = 2,
> +};
> +
> +struct pnfs_block_layout {
> + struct pnfs_layout_hdr bl_layout;
> + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
> + spinlock_t bl_ext_lock; /* Protects list manipulation */
> + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
> + struct list_head bl_commit; /* Needs layout commit */
> + struct list_head bl_committing; /* Layout committing */
> + unsigned int bl_count; /* entries in bl_commit */
> + sector_t bl_blocksize; /* Server blocksize in sectors */
> +};
> +
> +static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
> +{
> + return container_of(lo, struct pnfs_block_layout, bl_layout);
> +}
> +
> +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
Signed-off-by: Jim Rees <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[upcall bugfixes]
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/blocklayout/Makefile | 2 +-
fs/nfs/blocklayout/blocklayout.c | 42 ++++++++
fs/nfs/blocklayout/blocklayout.h | 40 +++++++
fs/nfs/blocklayout/blocklayoutdev.c | 191 +++++++++++++++++++++++++++++++++++
fs/nfs/client.c | 2 +-
include/linux/nfs.h | 2 +
6 files changed, 277 insertions(+), 2 deletions(-)
create mode 100644 fs/nfs/blocklayout/blocklayoutdev.c
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 5cfadf6..5bf3409 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,4 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 291cc01..3afe363 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -31,6 +31,8 @@
*/
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
#include "blocklayout.h"
@@ -40,6 +42,9 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <[email protected]>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+struct dentry *bl_device_pipe;
+wait_queue_head_t bl_wq;
+
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
@@ -163,13 +168,49 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.clear_layoutdriver = bl_clear_layoutdriver,
};
+/* Callbacks passed to rpc_mkpipe() for the "blocklayout" pipe in
+ * nfs4blocklayout_init; the handlers are declared in blocklayout.h.
+ */
+static const struct rpc_pipe_ops bl_upcall_ops = {
+ .upcall = bl_pipe_upcall,
+ .downcall = bl_pipe_downcall,
+ .destroy_msg = bl_pipe_destroy_msg,
+};
+
+/* Module init: register the layout driver, then create the "blocklayout"
+ * upcall pipe under NFS_PIPE_DIRNAME in rpc_pipefs.
+ *
+ * Returns 0 on success or a negative errno.  On failure the layout driver
+ * is unregistered again and the rpc_pipefs mount reference is dropped.
+ * On success the mount reference taken by rpc_get_mount() is kept.
+ * NOTE(review): the exit path presumably has to drop it with
+ * rpc_put_mount() -- confirm.
+ */
static int __init nfs4blocklayout_init(void)
{
+	struct nameidata nd;
+	struct path path;
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
ret = pnfs_register_layoutdriver(&blocklayout_type);
+	if (ret)
+		goto out;
+
+	init_waitqueue_head(&bl_wq);
+
+	path.mnt = rpc_get_mount();
+	if (IS_ERR(path.mnt)) {
+		ret = PTR_ERR(path.mnt);
+		goto out_remove;
+	}
+
+	ret = vfs_path_lookup(path.mnt->mnt_root,
+			      path.mnt,
+			      NFS_PIPE_DIRNAME, 0, &nd);
+	if (ret)
+		goto out_putrpc;
+
+	bl_device_pipe = rpc_mkpipe(nd.path.dentry, "blocklayout", NULL,
+				    &bl_upcall_ops, 0);
+	/* The lookup reference is only needed while the pipe is created */
+	path_put(&nd.path);
+	if (IS_ERR(bl_device_pipe)) {
+		ret = PTR_ERR(bl_device_pipe);
+		goto out_putrpc;
+	}
+out:
+	return ret;
+
+out_putrpc:
+	/* Balance rpc_get_mount() so a failed load does not pin rpc_pipefs */
+	rpc_put_mount();
+out_remove:
+	pnfs_unregister_layoutdriver(&blocklayout_type);
return ret;
}
@@ -179,6 +220,7 @@ static void __exit nfs4blocklayout_exit(void)
__func__);
pnfs_unregister_layoutdriver(&blocklayout_type);
+ rpc_unlink(bl_device_pipe);
}
MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 825d651..002b996 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -34,8 +34,16 @@
#include <linux/device-mapper.h>
#include <linux/nfs_fs.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
#include "../pnfs.h"
+/* An opened block device together with the pNFS device id it belongs to;
+ * filled in by nfs4_blk_decode_device().
+ */
+struct pnfs_block_dev {
+ struct list_head bm_node;
+ struct nfs4_deviceid bm_mdevid; /* associated devid */
+ struct block_device *bm_mdev; /* meta device itself */
+};
+
enum exstate4 {
PNFS_BLOCK_READWRITE_DATA = 0,
PNFS_BLOCK_READ_DATA = 1,
@@ -88,5 +96,37 @@ static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
return container_of(lo, struct pnfs_block_layout, bl_layout);
}
+/* Reply that user space writes to the pipe; bl_pipe_downcall() accepts
+ * only writes of exactly this size.
+ */
+struct bl_dev_msg {
+ int status; /* compared against BL_DEVICE_REQUEST_PROC by the reader */
+ uint32_t major, minor; /* device number handed to MKDEV() */
+};
+
+/* Header that nfs4_blk_decode_device() puts in front of the device data
+ * it sends up the pipe.
+ */
+struct bl_msg_hdr {
+ u8 type; /* BL_DEVICE_MOUNT when sent by nfs4_blk_decode_device() */
+ u16 totallen; /* length of entire message, including hdr itself */
+ /* NOTE(review): nfs4_blk_decode_device() stores dev->mincount here, i.e.
+ * the payload length without the header -- confirm which value the
+ * user space daemon expects.
+ */
+};
+
+extern struct dentry *bl_device_pipe;
+extern wait_queue_head_t bl_wq;
+
+#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
+
+/* blocklayoutdev.c */
+ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *,
+ char __user *, size_t);
+ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+ struct pnfs_device *dev,
+ struct list_head *sdlist);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+ struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+
void bl_put_extent(struct pnfs_block_extent *be);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
new file mode 100644
index 0000000..7e1377f
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -0,0 +1,191 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ * Device operations for the pnfs nfs4 block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <[email protected]>
+ * Fred Isaman <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Open a block_device by device number.
+ *
+ * Returns the device opened with FMODE_READ, or NULL (not an ERR_PTR)
+ * when blkdev_get_by_dev() fails; the error code is only reported
+ * through dprintk.
+ */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+	struct block_device *bd;
+
+	dprintk("%s enter\n", __func__);
+	bd = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (!IS_ERR(bd))
+		return bd;
+
+	dprintk("%s failed to open device : %ld\n",
+			__func__, PTR_ERR(bd));
+	return NULL;
+}
+
+/*
+ * Release a block device that was opened with FMODE_READ.
+ * Returns the result of blkdev_put().
+ */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+	int ret;
+
+	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
+			MINOR(bdev->bd_dev));
+	ret = blkdev_put(bdev, FMODE_READ);
+	return ret;
+}
+
+/*
+ * Shouldn't there be a rpc_generic_upcall() to do this for us?
+ *
+ * Copy the not yet delivered part of msg to the user buffer dst, at most
+ * buflen bytes.  Returns the number of bytes copied and advances
+ * msg->copied by that amount, or returns -EFAULT (also stored in
+ * msg->errno) when nothing of the requested chunk could be copied.
+ */
+ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+		       char __user *dst, size_t buflen)
+{
+	size_t want = min(msg->len - msg->copied, buflen);
+	char *from = (char *)msg->data + msg->copied;
+	unsigned long uncopied = copy_to_user(dst, from, want);
+	size_t done;
+
+	if (uncopied == want) {
+		msg->errno = -EFAULT;
+		return -EFAULT;
+	}
+
+	/* A partial copy still counts as progress */
+	done = want - uncopied;
+	msg->copied += done;
+	msg->errno = 0;
+	return done;
+}
+
+static struct bl_dev_msg bl_mount_reply;
+
+/* Receive the reply of the user space daemon: exactly one
+ * struct bl_dev_msg, stored in bl_mount_reply.  Sleepers on bl_wq are
+ * woken once the reply is in place.
+ *
+ * Returns mlen on success, -EINVAL for a write of any other size and
+ * -EFAULT if the user buffer cannot be read.
+ */
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+			 size_t mlen)
+{
+	if (mlen != sizeof(bl_mount_reply))
+		return -EINVAL;
+	if (copy_from_user(&bl_mount_reply, src, mlen))
+		return -EFAULT;
+
+	wake_up(&bl_wq);
+	return mlen;
+}
+
+/* Pipe message teardown hook: when the message carries a negative errno,
+ * wake whoever sleeps on bl_wq; otherwise do nothing.
+ */
+void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	if (msg->errno < 0)
+		wake_up(&bl_wq);
+}
+
+/*
+ * Resolve a pNFS block device through user space.
+ *
+ * The dev->mincount bytes at dev->area are sent up the blocklayout pipe
+ * behind a struct bl_msg_hdr.  The caller then sleeps on bl_wq until it
+ * is woken, reads the reply from bl_mount_reply and opens the block
+ * device with the major/minor found there.  server and sdlist are not
+ * used here.
+ *
+ * Returns a newly allocated pnfs_block_dev on success.  On failure the
+ * result is either an ERR_PTR (-ENOMEM for allocation failures, -EINVAL
+ * when the reply status is not BL_DEVICE_REQUEST_PROC) or NULL (upcall
+ * could not be queued, or the device could not be opened), so callers
+ * have to check for both.
+ *
+ * NOTE(review): bl_mount_reply and bl_wq are shared by all callers;
+ * nothing visible here serialises concurrent requests -- confirm.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server,
+		       struct pnfs_device *dev,
+		       struct list_head *sdlist)
+{
+	struct pnfs_block_dev *rv = NULL;
+	struct block_device *bd = NULL;
+	struct rpc_pipe_msg msg;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_MOUNT,
+		.totallen = dev->mincount,
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+	struct bl_dev_msg *reply = &bl_mount_reply;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+
+	memset(&msg, 0, sizeof(msg));
+	msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
+	if (!msg.data) {
+		rv = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* Message layout: header immediately followed by the device data */
+	memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg.data;
+	memcpy(&dataptr[sizeof(bl_msg)], dev->area, dev->mincount);
+	msg.len = sizeof(bl_msg) + dev->mincount;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&bl_wq, &wq);
+	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+		remove_wait_queue(&bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		dprintk("%s failed to open device: %d\n",
+			__func__, reply->status);
+		rv = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor));
+	if (!bd) {
+		/* nfs4_blkdev_get() returns NULL, never an ERR_PTR */
+		dprintk("%s failed to open device %u:%u\n",
+			__func__, reply->major, reply->minor);
+		goto out;
+	}
+
+	rv = kzalloc(sizeof(*rv), GFP_NOFS);
+	if (!rv) {
+		/* Nobody else will release the device opened above */
+		nfs4_blkdev_put(bd);
+		rv = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	rv->bm_mdev = bd;
+	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
+	dprintk("%s Created device %s with bd_block_size %u\n",
+		__func__,
+		bd->bd_disk->disk_name,
+		bd->bd_block_size);
+
+out:
+	/* msg.data is NULL here when its allocation failed */
+	kfree(msg.data);
+	return rv;
+}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index de00a37..5833fbb 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct rpc_program nfs_program = {
.nrvers = ARRAY_SIZE(nfs_version),
.version = nfs_version,
.stats = &nfs_rpcstat,
- .pipe_dir_name = "/nfs",
+ .pipe_dir_name = NFS_PIPE_DIRNAME,
};
struct rpc_stat nfs_rpcstat = {
diff --git a/include/linux/nfs.h b/include/linux/nfs.h
index f387919..8c6ee44 100644
--- a/include/linux/nfs.h
+++ b/include/linux/nfs.h
@@ -29,6 +29,8 @@
#define NFS_MNT_VERSION 1
#define NFS_MNT3_VERSION 3
+#define NFS_PIPE_DIRNAME "/nfs"
+
/*
* NFS stats. The good thing with these values is that NFSv3 errors are
* a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
--
1.7.4.1
From: Benny Halevy <[email protected]>
[pnfsblock: use pnfs_generic_pg_init_read/write]
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 14 ++++++++++++++
1 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 16214df..190b61f 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -767,6 +767,18 @@ bl_clear_layoutdriver(struct nfs_server *server)
return 0;
}
+/* Page I/O hooks for reads; all three are the generic pNFS helpers. */
+static const struct nfs_pageio_ops bl_pg_read_ops = {
+ .pg_init = pnfs_generic_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+};
+
+/* Page I/O hooks for writes; all three are the generic pNFS helpers. */
+static const struct nfs_pageio_ops bl_pg_write_ops = {
+ .pg_init = pnfs_generic_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+};
+
static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
@@ -780,6 +792,8 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
.clear_layoutdriver = bl_clear_layoutdriver,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
};
static const struct rpc_pipe_ops bl_upcall_ops = {
--
1.7.4.1
From: Fred Isaman <[email protected]>
[SQUASHME: pnfs: blocklayout: port block layout code]
Signed-off-by: Peng Tao <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 32 +++++++++++++
fs/nfs/blocklayout/blocklayout.h | 2 +
fs/nfs/blocklayout/extents.c | 95 ++++++++++++++++++++++++++++++++++++++
3 files changed, 129 insertions(+), 0 deletions(-)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6d6ac0e..16214df 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -346,6 +346,30 @@ out:
return PNFS_NOT_ATTEMPTED;
}
+/* Walk the extents covering the page-aligned range around
+ * [offset, offset + count) and call mark_for_commit() on every part that
+ * lies in an INVALID_DATA extent.  Does nothing when count is 0.
+ * A sector without an extent triggers BUG_ON; the result of
+ * mark_for_commit() is ignored.
+ */
+static void mark_extents_written(struct pnfs_block_layout *bl,
+		__u64 offset, __u32 count)
+{
+	sector_t isect, end, len;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s(%llu, %u)\n", __func__, offset, count);
+	if (!count)
+		return;
+
+	/* Round the start down and the end up to page boundaries */
+	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
+	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+	end >>= SECTOR_SHIFT;
+
+	for (; isect < end; isect += len) {
+		be = bl_find_get_extent(bl, isect, NULL);
+		BUG_ON(!be); /* FIXME */
+		/* Stay inside both the range and the current extent */
+		len = min(end, be->be_f_offset + be->be_length) - isect;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+			mark_for_commit(be, isect, len); /* What if fails? */
+		bl_put_extent(be);
+	}
+}
+
/* This is basically copied from mpage_end_io_read */
static void bl_end_io_write(struct bio *bio, int err)
{
@@ -372,6 +396,14 @@ static void bl_write_cleanup(struct work_struct *work)
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
wdata = container_of(task, struct nfs_write_data, task);
+ if (!wdata->task.tk_status) {
+ /* Marks for LAYOUTCOMMIT */
+ /* BUG - this should be called after each bio, not after
+ * all finish, unless have some way of storing success/failure
+ */
+ mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+ wdata->args.offset, wdata->args.count);
+ }
pnfs_ld_write_done(wdata);
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 197f919..52986cc 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -202,5 +202,7 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
int status);
int bl_add_merge_extent(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
+int mark_for_commit(struct pnfs_block_extent *be,
+ sector_t offset, sector_t length);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 483c235..a5c2851 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -217,6 +217,48 @@ int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
return rv;
}
+/* Report whether every step-sized slot in [start, end) is tracked in @tree
+ * with bit @tag set.  Assume start, end already sector aligned.
+ *
+ * The tracking list is walked from its tail (presumably it is kept sorted by
+ * ascending it_sector - confirm), skipping entries at or beyond @end.  The
+ * first remaining entry must sit exactly one step below @end and each later
+ * entry exactly one step below its predecessor; a gap, or an entry without
+ * the tag, ends the search with 0.  Returns 1 once an accepted entry reaches
+ * down to @start, and 0 if the list runs out first.
+ *
+ * Whether the walk has begun is kept in an explicit flag instead of being
+ * inferred from expect == 0: sector 0 is a legitimate expected position, and
+ * using it as the "not started" marker made fully tagged ranges that begin
+ * at sector 0 look unwritten.
+ */
+static int
+_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
+{
+	struct pnfs_inval_tracking *pos;
+	u64 expect = 0;
+	int started = 0;
+
+	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector >= end)
+			continue;
+		if (!started) {
+			/* Topmost slot must be the one that ends at @end */
+			if (pos->it_sector != end - tree->mtt_step_size)
+				return 0;
+			started = 1;
+		} else if (pos->it_sector != expect) {
+			return 0;
+		}
+		if (!(pos->it_tags & (1 << tag)))
+			return 0;
+		/* Finished if this slot reaches @start.  Checking
+		 * it_sector < step first keeps the subtraction from wrapping.
+		 */
+		if (pos->it_sector < tree->mtt_step_size ||
+		    pos->it_sector - tree->mtt_step_size < start)
+			return 1;
+		expect = pos->it_sector - tree->mtt_step_size;
+	}
+	return 0;
+}
+
+/* Locked front end to _range_has_tag(): returns its verdict for the
+ * EXTENT_WRITTEN tag over [start, end), holding im_lock during the walk.
+ */
+static int is_range_written(struct pnfs_inval_markings *marks,
+			    sector_t start, sector_t end)
+{
+	int written;
+
+	spin_lock(&marks->im_lock);
+	written = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+	spin_unlock(&marks->im_lock);
+
+	return written;
+}
+
/* Marks sectors in [offest, offset_length) as having been initialized.
* All lengths are step-aligned, where step is min(pagesize, blocksize).
* Notes where partial block is initialized, and helps prepare it for
@@ -396,6 +438,59 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
print_clist(clist, bl->bl_count);
}
+/* Mark sectors [offset, offset + length) of be as written and queue the
+ * block-aligned part of that range on the layout's commit list.
+ *
+ * Note the range described by offset, length is guaranteed to be contained
+ * within be.
+ *
+ * Each end of the range is moved outward to a bl_blocksize boundary when
+ * is_range_written() reports the added gap as written, and inward to the
+ * neighbouring boundary otherwise.  If nothing remains after that, the
+ * short extent is freed and nothing is queued.
+ *
+ * Returns 0 on success (whether or not anything was queued), or -ENOMEM if
+ * the short extent cannot be allocated, in which case no sectors are marked.
+ * NOTE(review): assumes normalize()/normalize_up() round down/up to a
+ * multiple of the block size - confirm against their definitions.
+ */
+int mark_for_commit(struct pnfs_block_extent *be,
+ sector_t offset, sector_t length)
+{
+ sector_t new_end, end = offset + length;
+ struct pnfs_block_short_extent *new;
+ struct pnfs_block_layout *bl = container_of(be->be_inval,
+ struct pnfs_block_layout,
+ bl_inval);
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+
+ mark_written_sectors(be->be_inval, offset, length);
+ /* We want to add the range to commit list, but it must be
+ * block-normalized, and verified that the normalized range has
+ * been entirely written to disk.
+ */
+ new->bse_f_offset = offset;
+ offset = normalize(offset, bl->bl_blocksize);
+ if (offset < new->bse_f_offset) {
+ if (is_range_written(be->be_inval, offset, new->bse_f_offset))
+ new->bse_f_offset = offset;
+ else
+ new->bse_f_offset = offset + bl->bl_blocksize;
+ }
+ new_end = normalize_up(end, bl->bl_blocksize);
+ if (end < new_end) {
+ if (is_range_written(be->be_inval, end, new_end))
+ end = new_end;
+ else
+ end = new_end - bl->bl_blocksize;
+ }
+ if (end <= new->bse_f_offset) {
+ kfree(new);
+ return 0;
+ }
+ new->bse_length = end - new->bse_f_offset;
+ new->bse_devid = be->be_devid;
+ new->bse_mdev = be->be_mdev;
+
+ spin_lock(&bl->bl_ext_lock);
+ /* new will be freed, either by add_to_commitlist if it decides not
+ * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
+ */
+ add_to_commitlist(bl, new);
+ spin_unlock(&bl->bl_ext_lock);
+ return 0;
+}
+
static void print_bl_extent(struct pnfs_block_extent *be)
{
dprintk("PRINT EXTENT extent %p\n", be);
--
1.7.4.1
On 2011-07-21 15:34, Jim Rees wrote:
> From: Fred Isaman <[email protected]>
>
> Block layout needs it to determine IO size.
>
> Signed-off-by: Fred Isaman <[email protected]>
> Signed-off-by: Tao Guo <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/client.c | 1 +
> fs/nfs/nfs4_fs.h | 2 +-
> fs/nfs/nfs4proc.c | 5 +-
> fs/nfs/nfs4xdr.c | 99 +++++++++++++++++++++++++++++++++++++--------
> include/linux/nfs_fs_sb.h | 4 +-
> include/linux/nfs_xdr.h | 3 +-
> 6 files changed, 92 insertions(+), 22 deletions(-)
>
> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
> index a9b1848..de00a37 100644
> --- a/fs/nfs/client.c
> +++ b/fs/nfs/client.c
> @@ -936,6 +936,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
> if (server->wsize > NFS_MAX_FILE_IO_SIZE)
> server->wsize = NFS_MAX_FILE_IO_SIZE;
> server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> + server->pnfs_blksize = fsinfo->blksize;
> set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype);
>
> server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
> diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
> index c30aed2..b7ad2f0 100644
> --- a/fs/nfs/nfs4_fs.h
> +++ b/fs/nfs/nfs4_fs.h
> @@ -318,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
> extern const u32 nfs4_fattr_bitmap[2];
> extern const u32 nfs4_statfs_bitmap[2];
> extern const u32 nfs4_pathconf_bitmap[2];
> -extern const u32 nfs4_fsinfo_bitmap[2];
> +extern const u32 nfs4_fsinfo_bitmap[3];
> extern const u32 nfs4_fs_locations_bitmap[2];
>
> /* nfs4renewd.c */
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 784c1a2..e02f545 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -140,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = {
> 0
> };
>
> -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
> +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
> | FATTR4_WORD0_MAXREAD
> | FATTR4_WORD0_MAXWRITE
> | FATTR4_WORD0_LEASE_TIME,
> FATTR4_WORD1_TIME_DELTA
> - | FATTR4_WORD1_FS_LAYOUT_TYPES
> + | FATTR4_WORD1_FS_LAYOUT_TYPES,
> + FATTR4_WORD2_LAYOUT_BLKSIZE
> };
>
> const u32 nfs4_fs_locations_bitmap[2] = {
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index a82dd40..5ce3c64 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int);
> #define encode_restorefh_maxsz (op_encode_hdr_maxsz)
> #define decode_restorefh_maxsz (op_decode_hdr_maxsz)
> #define encode_fsinfo_maxsz (encode_getattr_maxsz)
> -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15)
> +/* The 5 accounts for the PNFS attributes, and assumes that at most three
> + * layout types will be returned.
> + */
> +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
> + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5)
> #define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
> #define decode_renew_maxsz (op_decode_hdr_maxsz)
> #define encode_setclientid_maxsz \
> @@ -1123,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
> hdr->replen += decode_getattr_maxsz;
> }
>
> +static void
> +encode_getattr_three(struct xdr_stream *xdr,
> + uint32_t bm0, uint32_t bm1, uint32_t bm2,
> + struct compound_hdr *hdr)
> +{
> + __be32 *p;
> +
> + p = reserve_space(xdr, 4);
> + *p = cpu_to_be32(OP_GETATTR);
> + if (bm2) {
> + p = reserve_space(xdr, 16);
> + *p++ = cpu_to_be32(3);
> + *p++ = cpu_to_be32(bm0);
> + *p++ = cpu_to_be32(bm1);
> + *p = cpu_to_be32(bm2);
> + } else if (bm1) {
> + p = reserve_space(xdr, 12);
> + *p++ = cpu_to_be32(2);
> + *p++ = cpu_to_be32(bm0);
> + *p = cpu_to_be32(bm1);
> + } else {
> + p = reserve_space(xdr, 8);
> + *p++ = cpu_to_be32(1);
> + *p = cpu_to_be32(bm0);
> + }
> + hdr->nops++;
> + hdr->replen += decode_getattr_maxsz;
> +}
> +
> static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
> {
> encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
> @@ -1131,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c
>
> static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
> {
> - encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
> - bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
> + encode_getattr_three(xdr,
> + bitmask[0] & nfs4_fsinfo_bitmap[0],
> + bitmask[1] & nfs4_fsinfo_bitmap[1],
> + bitmask[2] & nfs4_fsinfo_bitmap[2],
> + hdr);
> }
>
> static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
> @@ -2643,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
> struct compound_hdr hdr = {
> .nops = 0,
> };
> - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
> + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
>
> encode_compound_hdr(xdr, req, &hdr);
> encode_setclientid_confirm(xdr, arg, &hdr);
> @@ -2787,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req,
> struct compound_hdr hdr = {
> .minorversion = nfs4_xdr_minorversion(&args->la_seq_args),
> };
> - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
> + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
>
> encode_compound_hdr(xdr, req, &hdr);
> encode_sequence(xdr, &args->la_seq_args, &hdr);
> @@ -3068,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
> goto out_overflow;
> bmlen = be32_to_cpup(p);
>
> - bitmap[0] = bitmap[1] = 0;
> + bitmap[0] = bitmap[1] = bitmap[2] = 0;
> p = xdr_inline_decode(xdr, (bmlen << 2));
> if (unlikely(!p))
> goto out_overflow;
> if (bmlen > 0) {
> bitmap[0] = be32_to_cpup(p++);
> - if (bmlen > 1)
> - bitmap[1] = be32_to_cpup(p);
> + if (bmlen > 1) {
> + bitmap[1] = be32_to_cpup(p++);
> + if (bmlen > 2)
> + bitmap[2] = be32_to_cpup(p);
> + }
> }
> return 0;
> out_overflow:
> @@ -3107,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
> return ret;
> bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS;
> } else
> - bitmask[0] = bitmask[1] = 0;
> - dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]);
> + bitmask[0] = bitmask[1] = bitmask[2] = 0;
> + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
> + bitmask[0], bitmask[1], bitmask[2]);
> return 0;
> }
>
> @@ -4162,7 +4202,7 @@ out_overflow:
> static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
> {
> __be32 *savep;
> - uint32_t attrlen, bitmap[2] = {0};
> + uint32_t attrlen, bitmap[3] = {0};
> int status;
>
> if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
> @@ -4188,7 +4228,7 @@ xdr_error:
> static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
> {
> __be32 *savep;
> - uint32_t attrlen, bitmap[2] = {0};
> + uint32_t attrlen, bitmap[3] = {0};
> int status;
>
> if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
> @@ -4220,7 +4260,7 @@ xdr_error:
> static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
> {
> __be32 *savep;
> - uint32_t attrlen, bitmap[2] = {0};
> + uint32_t attrlen, bitmap[3] = {0};
> int status;
>
> if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
> @@ -4360,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
> {
> __be32 *savep;
> uint32_t attrlen,
> - bitmap[2] = {0};
> + bitmap[3] = {0};
> int status;
>
> status = decode_op_hdr(xdr, OP_GETATTR);
> @@ -4446,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
> return status;
> }
>
> +/*
> + * The prefered block size for layout directed io
> + */
> +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
> + uint32_t *res)
> +{
> + __be32 *p;
> +
> + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
> + *res = 0;
> + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
> + p = xdr_inline_decode(xdr, 4);
> + if (unlikely(!p)) {
> + print_overflow_msg(__func__, xdr);
> + return -EIO;
> + }
> + *res = be32_to_cpup(p);
> + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
> + }
> + return 0;
> +}
> +
> static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
> {
> __be32 *savep;
> - uint32_t attrlen, bitmap[2];
> + uint32_t attrlen, bitmap[3];
> int status;
>
> if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
> @@ -4477,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
> status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
> if (status != 0)
> goto xdr_error;
> + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
> + if (status)
> + goto xdr_error;
>
> status = verify_attr_len(xdr, savep, attrlen);
> xdr_error:
> @@ -4896,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
> {
> __be32 *savep;
> uint32_t attrlen,
> - bitmap[2] = {0};
> + bitmap[3] = {0};
> struct kvec *iov = req->rq_rcv_buf.head;
> int status;
>
> @@ -6849,7 +6914,7 @@ out:
> int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
> int plus)
> {
> - uint32_t bitmap[2] = {0};
> + uint32_t bitmap[3] = {0};
> uint32_t len;
> __be32 *p = xdr_inline_decode(xdr, 4);
> if (unlikely(!p))
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 4faeac8..6e6ab4a 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -132,7 +132,7 @@ struct nfs_server {
> #endif
>
> #ifdef CONFIG_NFS_V4
> - u32 attr_bitmask[2];/* V4 bitmask representing the set
> + u32 attr_bitmask[3];/* V4 bitmask representing the set
> of attributes supported on this
> filesystem */
> u32 cache_consistency_bitmask[2];
> @@ -145,6 +145,8 @@ struct nfs_server {
> filesystem */
> struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
> struct rpc_wait_queue roc_rpcwaitq;
> + void *pnfs_ld_data; /* per mount point data */
pnfs_ld_data seems to be used first only in [PATCH 13/25]
"pnfsblock: call and parse getdevicelist"
Benny
> + u32 pnfs_blksize; /* layout_blksize attr */
>
> /* the following fields are protected by nfs_client->cl_lock */
> struct rb_root state_owners;
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 21f333e..94f27e5 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -122,6 +122,7 @@ struct nfs_fsinfo {
> struct timespec time_delta; /* server time granularity */
> __u32 lease_time; /* in seconds */
> __u32 layouttype; /* supported pnfs layout driver */
> + __u32 blksize; /* preferred pnfs io block size */
> };
>
> struct nfs_fsstat {
> @@ -954,7 +955,7 @@ struct nfs4_server_caps_arg {
> };
>
> struct nfs4_server_caps_res {
> - u32 attr_bitmask[2];
> + u32 attr_bitmask[3];
> u32 acl_bitmask;
> u32 has_links;
> u32 has_symlinks;
From: Fred Isaman <[email protected]>
Adds structures and basic create/delete code for extents.
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Zhang Jingwang <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
---
fs/nfs/blocklayout/Makefile | 2 +-
fs/nfs/blocklayout/blocklayout.c | 20 ++++++--
fs/nfs/blocklayout/blocklayout.h | 1 +
fs/nfs/blocklayout/extents.c | 97 ++++++++++++++++++++++++++++++++++++++
4 files changed, 115 insertions(+), 5 deletions(-)
create mode 100644 fs/nfs/blocklayout/extents.c
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 6bf49cd..5cfadf6 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,4 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o
+blocklayoutdriver-objs := blocklayout.o extents.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 55a2a95..291cc01 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -53,12 +53,24 @@ bl_write_pagelist(struct nfs_write_data *wdata,
return PNFS_NOT_ATTEMPTED;
}
-/* STUB */
+/* FIXME - range ignored */
static void
-release_extents(struct pnfs_block_layout *bl,
- struct pnfs_layout_range *range)
+release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
{
- return;
+ int i;
+ struct pnfs_block_extent *be;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (i = 0; i < EXTENT_LISTS; i++) {
+ while (!list_empty(&bl->bl_extents[i])) {
+ be = list_first_entry(&bl->bl_extents[i],
+ struct pnfs_block_extent,
+ be_node);
+ list_del(&be->be_node);
+ bl_put_extent(be);
+ }
+ }
+ spin_unlock(&bl->bl_ext_lock);
}
/* STUB */
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index bda87e0..825d651 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -88,4 +88,5 @@ static inline struct pnfs_block_layout *BLK_LO2EXT(struct pnfs_layout_hdr *lo)
return container_of(lo, struct pnfs_block_layout, bl_layout);
}
+void bl_put_extent(struct pnfs_block_extent *be);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
new file mode 100644
index 0000000..44c3364
--- /dev/null
+++ b/fs/nfs/blocklayout/extents.c
@@ -0,0 +1,97 @@
+/*
+ * linux/fs/nfs/blocklayout/extents.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <[email protected]>
+ * Fred Isaman <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Debug helper: dprintk the pointer and, when @be is non-NULL, its file
+ * offset, length, volume offset and state.
+ */
+static void print_bl_extent(struct pnfs_block_extent *be)
+{
+	dprintk("PRINT EXTENT extent %p\n", be);
+	if (!be)
+		return;
+
+	dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
+	dprintk(" be_length %llu\n", (u64)be->be_length);
+	dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
+	dprintk(" be_state %d\n", be->be_state);
+}
+
+/* kref release callback: free the extent that embeds @kref. */
+static void
+destroy_extent(struct kref *kref)
+{
+	struct pnfs_block_extent *be =
+		container_of(kref, struct pnfs_block_extent, be_refcnt);
+
+	dprintk("%s be=%p\n", __func__, be);
+	kfree(be);
+}
+
+/* Drop one reference on @be; a NULL @be is ignored.  destroy_extent() is the
+ * release callback handed to kref_put().
+ */
+void
+bl_put_extent(struct pnfs_block_extent *be)
+{
+	if (!be)
+		return;
+
+	dprintk("%s enter %p (%i)\n", __func__, be,
+		atomic_read(&be->be_refcnt.refcount));
+	kref_put(&be->be_refcnt, destroy_extent);
+}
+
+/* Allocate an extent with GFP_NOFS.  On success its list node and refcount
+ * are initialised and be_inval is NULL; every other field is left unset for
+ * the caller to fill in.  Returns NULL if the allocation fails.
+ */
+struct pnfs_block_extent *alloc_extent(void)
+{
+	struct pnfs_block_extent *be;
+
+	be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
+	if (be) {
+		INIT_LIST_HEAD(&be->be_node);
+		kref_init(&be->be_refcnt);
+		be->be_inval = NULL;
+	}
+	return be;
+}
+
+/* Take an extra reference on @be and hand it back; NULL passes through. */
+struct pnfs_block_extent *
+get_extent(struct pnfs_block_extent *be)
+{
+	if (!be)
+		return NULL;
+
+	kref_get(&be->be_refcnt);
+	return be;
+}
+
+/* Debug helper: dump every extent linked on @list through be_node, framed
+ * by separator lines.
+ */
+void print_elist(struct list_head *list)
+{
+	struct pnfs_block_extent *pos;
+
+	dprintk("****************\n");
+	dprintk("Extent list looks like:\n");
+	list_for_each_entry(pos, list, be_node)
+		print_bl_extent(pos);
+	dprintk("****************\n");
+}
--
1.7.4.1
Signed-off-by: Jim Rees <[email protected]>
Signed-off-by: Fred Isaman <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
Signed-off-by: Benny Halevy <[email protected]>
[upcall bugfixes]
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/blocklayout/Makefile | 2 +-
fs/nfs/blocklayout/blocklayout.h | 3 +
fs/nfs/blocklayout/blocklayoutdm.c | 111 ++++++++++++++++++++++++++++++++++++
3 files changed, 115 insertions(+), 1 deletions(-)
create mode 100644 fs/nfs/blocklayout/blocklayoutdm.c
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 5bf3409..d581550 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,4 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o
+blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 002b996..08b4d6f 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -128,5 +128,8 @@ struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
+/* blocklayoutdm.c */
+void free_block_dev(struct pnfs_block_dev *bdev);
+
void bl_put_extent(struct pnfs_block_extent *be);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
new file mode 100644
index 0000000..eab95f3
--- /dev/null
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -0,0 +1,111 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdm.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Fred Isaman <[email protected]>
+ * Andy Adamson <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Queue a BL_DEVICE_UMOUNT upcall on bl_device_pipe for the device numbered
+ * @dev, then sleep until woken through bl_wq.
+ *
+ * The message is a struct bl_msg_hdr immediately followed by a struct
+ * bl_dev_msg carrying the major/minor numbers.  Failure to allocate the
+ * message or to queue the upcall is not reported to the caller.
+ *
+ * NOTE(review): the task state is only set to TASK_UNINTERRUPTIBLE after the
+ * upcall has been queued; a wakeup delivered in between looks like it would
+ * be lost - confirm against the code that wakes bl_wq.
+ */
+static void dev_remove(dev_t dev)
+{
+	struct rpc_pipe_msg msg;
+	struct bl_dev_msg bl_umount_request;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_UMOUNT,
+		.totallen = sizeof(bl_umount_request),
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+
+	dprintk("Entering %s\n", __func__);
+
+	memset(&msg, 0, sizeof(msg));
+	/* The buffer has to hold the header *and* the payload.  Sizing it as
+	 * 1 + sizeof(payload) let the memcpy()s below write past its end,
+	 * since the header (type + totallen) is larger than one byte.
+	 */
+	msg.len = sizeof(bl_msg) + bl_msg.totallen;
+	msg.data = kzalloc(msg.len, GFP_NOFS);
+	if (!msg.data)
+		goto out;
+
+	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
+	bl_umount_request.major = MAJOR(dev);
+	bl_umount_request.minor = MINOR(dev);
+
+	/* Header first, payload directly behind it */
+	memcpy(msg.data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg.data;
+	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
+
+	add_wait_queue(&bl_wq, &wq);
+	if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) {
+		remove_wait_queue(&bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&bl_wq, &wq);
+
+out:
+	kfree(msg.data);
+}
+
+/*
+ * Release meta device: hand bm_mdev to nfs4_blkdev_put(), logging a non-zero
+ * result, then ask for the device to be removed via dev_remove().
+ */
+static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+	/* Capture the device number before the put: bm_mdev should not be
+	 * dereferenced once it has been handed to nfs4_blkdev_put(), which
+	 * presumably drops the reference that keeps it alive.
+	 */
+	dev_t dev = bdev->bm_mdev->bd_dev;
+	int rv;
+
+	dprintk("%s Releasing\n", __func__);
+	rv = nfs4_blkdev_put(bdev->bm_mdev);
+	if (rv)
+		printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n",
+				__func__, rv);
+
+	dev_remove(dev);
+}
+
+/* Free @bdev; a NULL @bdev is ignored.  If a meta device is attached it is
+ * released through nfs4_blk_metadev_release() first.
+ */
+void free_block_dev(struct pnfs_block_dev *bdev)
+{
+	if (!bdev)
+		return;
+
+	if (bdev->bm_mdev) {
+		dprintk("%s Removing DM device: %d:%d\n",
+			__func__,
+			MAJOR(bdev->bm_mdev->bd_dev),
+			MINOR(bdev->bm_mdev->bd_dev));
+		nfs4_blk_metadev_release(bdev);
+	}
+	kfree(bdev);
+}
--
1.7.4.1
On 2011-07-21 15:34, Jim Rees wrote:
> From: Andy Adamson <[email protected]>
>
> This gives layout driver a chance to cleanup structures they put in at
> encode_layoutcommit.
>
> Signed-off-by: Andy Adamson <[email protected]>
> [fixup layout header pointer for layoutcommit]
> Signed-off-by: Benny Halevy <[email protected]>
> Signed-off-by: Benny Halevy <[email protected]>
> ---
> fs/nfs/nfs4proc.c | 1 +
> fs/nfs/nfs4xdr.c | 3 ++-
> fs/nfs/pnfs.c | 10 ++++++++++
> fs/nfs/pnfs.h | 5 +++++
> include/linux/nfs_xdr.h | 1 +
> 5 files changed, 19 insertions(+), 1 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index e02f545..795033c5 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -5965,6 +5965,7 @@ static void nfs4_layoutcommit_release(void *calldata)
> struct nfs4_layoutcommit_data *data = calldata;
> struct pnfs_layout_segment *lseg, *tmp;
>
> + pnfs_cleanup_layoutcommit(data->args.inode, data);
> /* Matched by references in pnfs_set_layoutcommit */
> list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
> list_del_init(&lseg->pls_lc_list);
> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
> index 5ce3c64..07c41b2 100644
> --- a/fs/nfs/nfs4xdr.c
> +++ b/fs/nfs/nfs4xdr.c
> @@ -1991,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
> *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
> /* Only whole file layouts */
> p = xdr_encode_hyper(p, 0); /* offset */
> - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
> + p = xdr_encode_hyper(p, args->lastbytewritten+1); /* length */
This particular change seems to be related to [PATCH 03/25]
"NFS41: Let layoutcommit handle multiple segments"
Benny
> *p++ = cpu_to_be32(0); /* reclaim */
> p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
> *p++ = cpu_to_be32(1); /* newoffset = TRUE */
> @@ -5596,6 +5596,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr,
> int status;
>
> status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
> + res->status = status;
> if (status)
> return status;
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index fbebd2a..3b20753 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1411,6 +1411,16 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
> }
> EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
>
> +void pnfs_cleanup_layoutcommit(struct inode *inode,
> + struct nfs4_layoutcommit_data *data)
> +{
> + struct nfs_server *nfss = NFS_SERVER(inode);
> +
> + if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
> + nfss->pnfs_curr_ld->cleanup_layoutcommit(NFS_I(inode)->layout,
> + data);
> +}
> +
> /*
> * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
> * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index bddd8b9..f271425 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -113,6 +113,9 @@ struct pnfs_layoutdriver_type {
> struct xdr_stream *xdr,
> const struct nfs4_layoutreturn_args *args);
>
> + void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> + struct nfs4_layoutcommit_data *data);
> +
> void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
> struct xdr_stream *xdr,
> const struct nfs4_layoutcommit_args *args);
> @@ -196,6 +199,8 @@ void pnfs_roc_release(struct inode *ino);
> void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
> bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
> void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
> +void pnfs_cleanup_layoutcommit(struct inode *inode,
> + struct nfs4_layoutcommit_data *data);
> int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
> int _pnfs_return_layout(struct inode *);
> int pnfs_ld_write_done(struct nfs_write_data *);
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 94f27e5..569ea5b 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -269,6 +269,7 @@ struct nfs4_layoutcommit_res {
> struct nfs_fattr *fattr;
> const struct nfs_server *server;
> struct nfs4_sequence_res seq_res;
> + int status;
> };
>
> struct nfs4_layoutcommit_data {