2014-09-03 04:36:23

by Christoph Hellwig

[permalink] [raw]
Subject: pnfs: add CB_NOTIFY_DEVICEID support

I know everyone was waiting for more pnfs patches from me, so today we'll have
some special bites that seem way out in crack monkey land but are for real:

- more kernel code!
- duplicating existing raid drivers!
- XDR encoding of previously XDR decoded data!

If you're not scared read on..

This series adds support for device change notifications to the pnfs core
and block layout driver. To properly implement NOTIFY_DEVICEID4_CHANGE I
had to move GETDEVICEINFO decoding into kernel space, and implement my own
trivial striping and concat layers in the pnfs blocklayout client. Why?
DM and MD are really, really unhappy about multiple raid devices using the
same physical devices. Which is fairly understandable from their point of
view, given that they normally manage on-disk metadata and would rather do
that exclusively. But we get all our metadata from the server and couldn't
care less about their problems, we have our own problem called deviceid
management.. Once we get a NOTIFY_DEVICEID4_CHANGE callback we need
to stop new I/O using the existing mapping, so we have to drop it from
the deviceid cache. At the same time another layout could easily request
the same deviceid again and for some period we'd have multiple volume
structures pointing to the same device.

Note that we do not break userspace compatibility - we still use blkmapd
to translate signatures to device numbers, and to do so we re-encode the
SIMPLE volumes we need translated to XDR and send it up to blkmapd just
for that.

A git tree is also available at

git://git.infradead.org/users/hch/pnfs.git notify-deviceid

This work was sponsored by NetApp, Inc.



2014-09-03 04:36:30

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 2/6] pnfs/blocklayout: move extent processing to blocklayout.c

This isn't device(id) related, so move it into the main file. Simple move
for now, the next commit will clean it up a bit.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 186 ++++++++++++++++++++++++++++++++++++
fs/nfs/blocklayout/blocklayout.h | 2 -
fs/nfs/blocklayout/blocklayoutdev.c | 186 ------------------------------------
3 files changed, 186 insertions(+), 188 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index ff8b43b..df8c567 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -446,6 +446,192 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
kfree(lseg);
}

+/*
+ * Tracks info needed to ensure extents in layout obey constraints of spec.
+ *
+ * nfs4_blk_process_layoutget() seeds start, inval and cowread with the
+ * requested range offset shifted down by SECTOR_SHIFT, and verify_extent()
+ * advances them as each extent of the reply is accepted.
+ */
+struct layout_verification {
+ u32 mode; /* R or RW */
+ u64 start; /* Expected start of next non-COW extent */
+ u64 inval; /* Start of INVAL coverage */
+ u64 cowread; /* End of COW read coverage */
+};
+
+/*
+ * Check one decoded extent against the layout requirements of the pnfs-block
+ * draft, section 2.3.1, and advance the bookkeeping in @lv when it passes.
+ *
+ * Returns 0 if the extent is acceptable and -EIO if it is not.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		/* a read layout may only carry extents that follow on directly */
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+
+	/* lv->mode == IOMODE_RW */
+	switch (be->be_state) {
+	case PNFS_BLOCK_READWRITE_DATA:
+		if (be->be_f_offset != lv->start || lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	case PNFS_BLOCK_INVALID_DATA:
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	case PNFS_BLOCK_READ_DATA:
+		if (be->be_f_offset > lv->start ||
+		    be->be_f_offset < lv->inval ||
+		    be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval += be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Decode one 64-bit value from the XDR buffer at *rp, advance *rp past it and
+ * store the value shifted down by SECTOR_SHIFT in *sp.
+ *
+ * Returns 0 on success, or -1 after logging a warning when any of the low
+ * nine bits of the decoded value is set; *sp is left untouched in that case.
+ */
+static int decode_sector_number(__be32 **rp, sector_t *sp)
+{
+	uint64_t bytes;
+
+	*rp = xdr_decode_hyper(*rp, &bytes);
+	if (!(bytes & 0x1ff)) {
+		*sp = bytes >> SECTOR_SHIFT;
+		return 0;
+	}
+
+	printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
+	return -1;
+}
+
+/*
+ * XDR decode pnfs_block_layout4 structure
+ *
+ * Decodes the extent array carried in @lgr, checks every extent with
+ * verify_extent() and, only once the whole reply has been accepted, inserts
+ * the extents into the extent tree of @lo.  @gfp_flags is used for the
+ * scratch page and the deviceid lookup.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, -EIO if the reply is
+ * malformed or a device cannot be looked up, or the error returned by
+ * ext_tree_insert().
+ */
+static int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+		struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+{
+	/* encoded size of one extent: the deviceid plus 28 bytes of fields */
+	const uint32_t extent_size = 28 + NFS4_DEVICEID4_SIZE;
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	int status = -EIO;
+	uint32_t count, i;
+	struct pnfs_block_extent *be = NULL, *save;
+	struct xdr_stream stream;
+	struct xdr_buf buf;
+	struct page *scratch;
+	__be32 *p;
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> SECTOR_SHIFT,
+		.inval = lgr->range.offset >> SECTOR_SHIFT,
+		.cowread = lgr->range.offset >> SECTOR_SHIFT,
+	};
+	LIST_HEAD(extents);
+
+	dprintk("---> %s\n", __func__);
+
+	scratch = alloc_page(gfp_flags);
+	if (!scratch)
+		return -ENOMEM;
+
+	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&stream, 4);
+	if (unlikely(!p))
+		goto out_err;
+
+	count = be32_to_cpup(p++);
+
+	dprintk("%s enter, number of extents %i\n", __func__, count);
+
+	/*
+	 * A reply of layoutp->len bytes cannot hold more extents than this;
+	 * checking first keeps extent_size * count from wrapping around.
+	 */
+	if (count > lgr->layoutp->len / extent_size)
+		goto out_err;
+	p = xdr_inline_decode(&stream, extent_size * count);
+	if (unlikely(!p))
+		goto out_err;
+
+	/* Decode individual extents, putting them in temporary
+	 * staging area until whole layout is decoded to make error
+	 * recovery easier.
+	 */
+	for (i = 0; i < count; i++) {
+		struct nfs4_deviceid id;
+
+		be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
+		if (!be) {
+			status = -ENOMEM;
+			goto out_err;
+		}
+		memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+		p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+		be->be_device =
+			nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+					       lo->plh_lc_cred, gfp_flags);
+		if (!be->be_device)
+			goto out_err;
+
+		/* The next three values are read in as bytes,
+		 * but stored as 512-byte sector lengths
+		 */
+		if (decode_sector_number(&p, &be->be_f_offset) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_length) < 0)
+			goto out_err;
+		if (decode_sector_number(&p, &be->be_v_offset) < 0)
+			goto out_err;
+		be->be_state = be32_to_cpup(p++);
+		if (verify_extent(be, &lv)) {
+			dprintk("%s verify failed\n", __func__);
+			goto out_err;
+		}
+		list_add_tail(&be->be_list, &extents);
+		/* now owned by the staging list, out_err must not free it */
+		be = NULL;
+	}
+	if (lgr->range.offset + lgr->range.length !=
+			lv.start << SECTOR_SHIFT) {
+		dprintk("%s Final length mismatch\n", __func__);
+		goto out_err;
+	}
+	if (lv.start < lv.cowread) {
+		dprintk("%s Final uncovered COW extent\n", __func__);
+		goto out_err;
+	}
+	/* Extents decoded properly, now try to merge them in to
+	 * existing layout extents.
+	 */
+	list_for_each_entry_safe(be, save, &extents, be_list) {
+		list_del(&be->be_list);
+
+		/*
+		 * NOTE(review): on failure this extent has already been
+		 * unlinked and is not released below; confirm whether
+		 * ext_tree_insert() disposes of it in that case.
+		 */
+		status = ext_tree_insert(bl, be);
+		if (status)
+			goto out_free_list;
+	}
+	status = 0;
+out:
+	__free_page(scratch);
+	dprintk("%s returns %i\n", __func__, status);
+	return status;
+
+out_err:
+	/* be is only set while an extent is allocated but not yet listed */
+	if (be) {
+		if (be->be_device)
+			nfs4_put_deviceid_node(be->be_device);
+		kfree(be);
+	}
+out_free_list:
+	while (!list_empty(&extents)) {
+		be = list_first_entry(&extents, struct pnfs_block_extent,
+				      be_list);
+		list_del(&be->be_list);
+		nfs4_put_deviceid_node(be->be_device);
+		kfree(be);
+	}
+	goto out;
+}
+
/* We pretty much ignore lseg, and store all data layout wide, so we
* can correctly merge.
*/
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9757f3e..00c11eb 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -113,8 +113,6 @@ struct bl_msg_hdr {
/* blocklayoutdev.c */
ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);

struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_mask);
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index d6527d2..2b54e29 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -40,19 +40,6 @@

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

-static int decode_sector_number(__be32 **rp, sector_t *sp)
-{
- uint64_t s;
-
- *rp = xdr_decode_hyper(*rp, &s);
- if (s & 0x1ff) {
- printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
- return -1;
- }
- *sp = s >> SECTOR_SHIFT;
- return 0;
-}
-
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
size_t mlen)
{
@@ -183,176 +170,3 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d)

kfree(dev);
}
-
-/* Tracks info needed to ensure extents in layout obey constraints of spec */
-struct layout_verification {
- u32 mode; /* R or RW */
- u64 start; /* Expected start of next non-COW extent */
- u64 inval; /* Start of INVAL coverage */
- u64 cowread; /* End of COW read coverage */
-};
-
-/* Verify the extent meets the layout requirements of the pnfs-block draft,
- * section 2.3.1.
- */
-static int verify_extent(struct pnfs_block_extent *be,
- struct layout_verification *lv)
-{
- if (lv->mode == IOMODE_READ) {
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
- be->be_state == PNFS_BLOCK_INVALID_DATA)
- return -EIO;
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- }
- /* lv->mode == IOMODE_RW */
- if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- if (lv->cowread > lv->start)
- return -EIO;
- lv->start += be->be_length;
- lv->inval = lv->start;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
- if (be->be_f_offset != lv->start)
- return -EIO;
- lv->start += be->be_length;
- return 0;
- } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
- if (be->be_f_offset > lv->start)
- return -EIO;
- if (be->be_f_offset < lv->inval)
- return -EIO;
- if (be->be_f_offset < lv->cowread)
- return -EIO;
- /* It looks like you might want to min this with lv->start,
- * but you really don't.
- */
- lv->inval = lv->inval + be->be_length;
- lv->cowread = be->be_f_offset + be->be_length;
- return 0;
- } else
- return -EIO;
-}
-
-/* XDR decode pnfs_block_layout4 structure */
-int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
-{
- struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
- int i, status = -EIO;
- uint32_t count;
- struct pnfs_block_extent *be = NULL, *save;
- struct xdr_stream stream;
- struct xdr_buf buf;
- struct page *scratch;
- __be32 *p;
- struct layout_verification lv = {
- .mode = lgr->range.iomode,
- .start = lgr->range.offset >> SECTOR_SHIFT,
- .inval = lgr->range.offset >> SECTOR_SHIFT,
- .cowread = lgr->range.offset >> SECTOR_SHIFT,
- };
- LIST_HEAD(extents);
-
- dprintk("---> %s\n", __func__);
-
- scratch = alloc_page(gfp_flags);
- if (!scratch)
- return -ENOMEM;
-
- xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
-
- p = xdr_inline_decode(&stream, 4);
- if (unlikely(!p))
- goto out_err;
-
- count = be32_to_cpup(p++);
-
- dprintk("%s enter, number of extents %i\n", __func__, count);
- p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
- if (unlikely(!p))
- goto out_err;
-
- /* Decode individual extents, putting them in temporary
- * staging area until whole layout is decoded to make error
- * recovery easier.
- */
- for (i = 0; i < count; i++) {
- struct nfs4_deviceid id;
-
- be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
- if (!be) {
- status = -ENOMEM;
- goto out_err;
- }
- memcpy(&id, p, NFS4_DEVICEID4_SIZE);
- p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-
- be->be_device =
- nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
- lo->plh_lc_cred, gfp_flags);
- if (!be->be_device)
- goto out_err;
-
- /* The next three values are read in as bytes,
- * but stored as 512-byte sector lengths
- */
- if (decode_sector_number(&p, &be->be_f_offset) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_length) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_v_offset) < 0)
- goto out_err;
- be->be_state = be32_to_cpup(p++);
- if (verify_extent(be, &lv)) {
- dprintk("%s verify failed\n", __func__);
- goto out_err;
- }
- list_add_tail(&be->be_list, &extents);
- }
- if (lgr->range.offset + lgr->range.length !=
- lv.start << SECTOR_SHIFT) {
- dprintk("%s Final length mismatch\n", __func__);
- be = NULL;
- goto out_err;
- }
- if (lv.start < lv.cowread) {
- dprintk("%s Final uncovered COW extent\n", __func__);
- be = NULL;
- goto out_err;
- }
- /* Extents decoded properly, now try to merge them in to
- * existing layout extents.
- */
- list_for_each_entry_safe(be, save, &extents, be_list) {
- list_del(&be->be_list);
-
- status = ext_tree_insert(bl, be);
- if (status)
- goto out_free_list;
- }
- status = 0;
- out:
- __free_page(scratch);
- dprintk("%s returns %i\n", __func__, status);
- return status;
-
- out_err:
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
- out_free_list:
- while (!list_empty(&extents)) {
- be = list_first_entry(&extents, struct pnfs_block_extent,
- be_list);
- list_del(&be->be_list);
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
- }
- goto out;
-}
--
1.9.1


2014-09-03 04:36:43

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 5/6] pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing

This patch moves parsing of the GETDEVICEINFO XDR to kernel space, as well
as the management of complex devices. The reason for that is we might have
multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
device mapper or md can't handle as they claim devices exclusively.

But as it turns out, simple striping / concatenation is fairly trivial to
implement anyway, so we make our life simpler by reducing the reliance
on blkmapd. For now we still use blkmapd by feeding it synthetic SIMPLE
device XDR to translate device signatures to device numbers, but in the
long run I have plans to eliminate it entirely.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/Makefile | 2 +-
fs/nfs/blocklayout/blocklayout.c | 92 ++++++----
fs/nfs/blocklayout/blocklayout.h | 81 ++++++++-
fs/nfs/blocklayout/dev.c | 358 +++++++++++++++++++++++++++++++++++++++
fs/nfs/blocklayout/rpc_pipefs.c | 141 ++++-----------
5 files changed, 526 insertions(+), 148 deletions(-)
create mode 100644 fs/nfs/blocklayout/dev.c

diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index e177026..3ca14c3 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -3,4 +3,4 @@
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o

-blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
+blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 7b3c8c9..e92591c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio)
return NULL;
}

-static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par)
+static struct bio *
+bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
+ void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
- struct pnfs_block_dev *dev =
- container_of(be->be_device, struct pnfs_block_dev, d_node);
struct bio *bio;

npg = min(npg, BIO_MAX_PAGES);
@@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}

if (bio) {
- bio->bi_iter.bi_sector = isect - be->be_f_offset +
- be->be_v_offset;
- bio->bi_bdev = dev->d_bdev;
+ bio->bi_iter.bi_sector = disk_sector;
+ bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}

-static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
- sector_t isect, struct page *page,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
- struct parallel_io *par,
- unsigned int offset, int len)
+static struct bio *
+do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
+ struct page *page, struct pnfs_block_dev_map *map,
+ struct pnfs_block_extent *be,
+ void (*end_io)(struct bio *, int err),
+ struct parallel_io *par, unsigned int offset, int *len)
{
- isect = isect + (offset >> SECTOR_SHIFT);
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, node);
+ u64 disk_addr, end;
+
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
- npg, rw, (unsigned long long)isect, offset, len);
+ npg, rw, (unsigned long long)isect, offset, *len);
+
+ /* translate to device offset */
+ isect += be->be_v_offset;
+ isect -= be->be_f_offset;
+
+ /* translate to physical disk offset */
+ disk_addr = (u64)isect << SECTOR_SHIFT;
+ if (disk_addr < map->start || disk_addr >= map->start + map->len) {
+ if (!dev->map(dev, disk_addr, map))
+ return ERR_PTR(-EIO);
+ bio = bl_submit_bio(rw, bio);
+ }
+ disk_addr += map->disk_offset;
+ disk_addr -= map->start;
+
+ /* limit length to what the device mapping allows */
+ end = disk_addr + *len;
+ if (end >= map->start + map->len)
+ *len = map->start + map->len - disk_addr;
+
retry:
if (!bio) {
- bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
+ bio = bl_alloc_init_bio(npg, map->bdev,
+ disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
- if (bio_add_page(bio, page, len, offset) < len) {
+ if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
@@ -203,6 +223,7 @@ static enum pnfs_try_status
bl_read_pagelist(struct nfs_pgio_header *header)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
@@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
-
- f_offset += pg_len;
- bytes_left -= pg_len;
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
} else {
BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}

+ isect += (pg_offset >> SECTOR_SHIFT);
+ extent_length -= (pg_offset >> SECTOR_SHIFT);
+
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
+
+ /* invalidate map */
+ map.start = NFS4_MAX_UINT64;
} else {
bio = do_add_page_to_bio(bio,
header->page_array.npages - i,
READ,
- isect, pages[i], &be,
+ isect, pages[i], &map, &be,
bl_end_io_read, par,
- pg_offset, pg_len);
+ pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
@@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header)
}
isect += (pg_len >> SECTOR_SHIFT);
extent_length -= (pg_len >> SECTOR_SHIFT);
+ f_offset += pg_len;
+ bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
@@ -346,6 +370,7 @@ static enum pnfs_try_status
bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
+ struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
@@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
size_t count = header->args.count;
struct page **pages = header->args.pages;
int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ unsigned int pg_len;
struct blk_plug plug;
int i;

@@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
extent_length = be.be_length - (isect - be.be_f_offset);
}

+ pg_len = PAGE_CACHE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
- WRITE, isect, pages[i], &be,
+ WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
- 0, PAGE_CACHE_SIZE);
+ 0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
- offset += PAGE_CACHE_SIZE;
- count -= PAGE_CACHE_SIZE;
- isect += PAGE_CACHE_SECTORS;
- extent_length -= PAGE_CACHE_SECTORS;
+
+ offset += pg_len;
+ count -= pg_len;
+ isect += (pg_len >> SECTOR_SHIFT);
+ extent_length -= (pg_len >> SECTOR_SHIFT);
}

header->res.count = header->args.count;
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c98d98a..3077391 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,9 +44,75 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)

+struct pnfs_block_dev;
+
+/*
+ * Volume type tag of one decoded volume.  nfs4_block_decode_volume() stores
+ * the 32-bit value read from the XDR stream directly into a field of this
+ * type, and bl_parse_deviceid() dispatches on it.
+ */
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
+};
+
+#define PNFS_BLOCK_MAX_UUIDS 4
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+/*
+ * One volume entry as decoded from the XDR stream by
+ * nfs4_block_decode_volume().  The member of the union that is filled in is
+ * selected by type.
+ */
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ /* starts at 8 and grows by 12 + sig_len for each signature decoded */
+ int len;
+ /* number of entries of sigs[] that were decoded */
+ int nr_sigs;
+ struct {
+ u64 offset;
+ /* number of bytes of sig[] copied from the stream */
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } sigs[PNFS_BLOCK_MAX_UUIDS];
+ } simple;
+ struct {
+ u64 start;
+ u64 len;
+ /* index of the sliced volume in the decoded volume array */
+ u32 volume;
+ } slice;
+ struct {
+ u32 volumes_count;
+ /* indices of the member volumes in the decoded volume array */
+ u32 volumes[MAX_RAID_DEVICES];
+ } concat;
+ struct {
+ u64 chunk_size;
+ u32 volumes_count;
+ /* indices of the member volumes in the decoded volume array */
+ u32 volumes[MAX_RAID_DEVICES];
+ } stripe;
+ };
+};
+
+/*
+ * Result of a pnfs_block_dev ->map() lookup, cached by the I/O paths.
+ *
+ * start and len describe the range of device addresses the mapping is valid
+ * for; do_add_page_to_bio() compares them against a byte address (a sector
+ * number shifted up by SECTOR_SHIFT), and the pagelist functions set start
+ * to NFS4_MAX_UINT64 to mark the mapping invalid.  The fields are therefore
+ * u64, so that neither the byte addresses nor the marker can be truncated
+ * where sector_t is narrower than 64 bits.
+ *
+ * disk_offset is added to a device address, after start has been
+ * subtracted, to obtain the address on bdev.
+ */
+struct pnfs_block_dev_map {
+	u64 start;
+	u64 len;
+
+	u64 disk_offset;
+	struct block_device *bdev;
+};
+
+/*
+ * In-kernel representation of one (possibly nested) block volume.  The
+ * top-level instance embeds the deviceid node; concat and stripe volumes
+ * keep their members in the children array.
+ */
struct pnfs_block_dev {
- struct nfs4_deviceid_node d_node;
- struct block_device *d_bdev;
+ struct nfs4_deviceid_node node;
+
+ /* for a concat member: sum of the lengths of the members before it */
+ u64 start;
+ /* size of the block device for a simple volume, slice length for a
+ * slice, sum of the member lengths for concat and stripe volumes
+ */
+ u64 len;
+
+ /* number of successfully parsed entries in children */
+ u32 nr_children;
+ struct pnfs_block_dev *children;
+ /* only set for stripe volumes */
+ u64 chunk_size;
+
+ /* block device opened by bl_parse_simple() */
+ struct block_device *bdev;
+ /* set to the slice start by bl_parse_slice() */
+ u64 disk_offset;
+
+ /* fills in *map for the given offset, returns false on failure */
+ bool (*map)(struct pnfs_block_dev *dev, u64 offset,
+ struct pnfs_block_dev_map *map);
};

enum exstate4 {
@@ -110,6 +176,11 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */

+/* dev.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
@@ -123,10 +194,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);

/* rpc_pipefs.c */
-struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
- struct pnfs_device *pdev, gfp_t gfp_mask);
-void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
-
+dev_t bl_resolve_deviceid(struct nfs_server *server,
+ struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
void __exit bl_cleanup_pipefs(void);

diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 0000000..ae18f80
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/blkdev.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "blocklayout.h"
+
+/*
+ * Release everything that hangs off @dev: recursively the parsed children
+ * and their array, or the opened block device of a leaf.  @dev itself is not
+ * freed, as it is either embedded in a children array or freed by the caller.
+ */
+static void
+bl_free_device(struct pnfs_block_dev *dev)
+{
+	u32 i;
+
+	for (i = 0; i < dev->nr_children; i++)
+		bl_free_device(&dev->children[i]);
+
+	/*
+	 * The array is allocated before the first child is parsed, so it can
+	 * exist while nr_children is still zero; free it unconditionally to
+	 * avoid leaking it on that error path (kfree(NULL) is a no-op).
+	 */
+	kfree(dev->children);
+
+	/* never hand an error pointer left by a failed open to blkdev_put() */
+	if (!dev->nr_children && !IS_ERR_OR_NULL(dev->bdev))
+		blkdev_put(dev->bdev, FMODE_READ);
+}
+
+/*
+ * Tear down the block device hierarchy that embeds deviceid node @d and free
+ * the top-level structure.
+ */
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *top;
+
+	top = container_of(d, struct pnfs_block_dev, node);
+	bl_free_device(top);
+	kfree(top);
+}
+
+/*
+ * Decode one volume from @xdr into @b.
+ *
+ * All counts and lengths come from the stream and are checked against the
+ * fixed array sizes of struct pnfs_block_volume before they are used.
+ *
+ * Returns 0 on success and -EIO if the stream is too short, a count or
+ * length exceeds what @b can hold, or the volume type is unknown.
+ */
+static int
+nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+	__be32 *p;
+	int i;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (!p)
+		return -EIO;
+	b->type = be32_to_cpup(p++);
+
+	switch (b->type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->simple.nr_sigs = be32_to_cpup(p++);
+		if (!b->simple.nr_sigs) {
+			dprintk("no signature\n");
+			return -EIO;
+		}
+		/* nr_sigs is a signed int, so also catch values that wrapped */
+		if (b->simple.nr_sigs < 0 ||
+		    b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+			dprintk("too many signatures\n");
+			return -EIO;
+		}
+
+		b->simple.len = 4 + 4;
+		for (i = 0; i < b->simple.nr_sigs; i++) {
+			p = xdr_inline_decode(xdr, 8 + 4);
+			if (!p)
+				return -EIO;
+			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
+			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+			if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+				dprintk("signature too long\n");
+				return -EIO;
+			}
+
+			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
+			if (!p)
+				return -EIO;
+			memcpy(&b->simple.sigs[i].sig, p,
+				b->simple.sigs[i].sig_len);
+
+			/*
+			 * NOTE(review): this counts the unpadded signature
+			 * length; confirm that is what the users of
+			 * simple.len expect.
+			 */
+			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+		}
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		p = xdr_inline_decode(xdr, 8 + 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->slice.start);
+		p = xdr_decode_hyper(p, &b->slice.len);
+		b->slice.volume = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		p = xdr_inline_decode(xdr, 4);
+		if (!p)
+			return -EIO;
+		b->concat.volumes_count = be32_to_cpup(p++);
+		if (b->concat.volumes_count > MAX_RAID_DEVICES) {
+			dprintk("too many volumes in concat\n");
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->concat.volumes_count; i++)
+			b->concat.volumes[i] = be32_to_cpup(p++);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		p = xdr_inline_decode(xdr, 8 + 4);
+		if (!p)
+			return -EIO;
+		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
+		b->stripe.volumes_count = be32_to_cpup(p++);
+		if (b->stripe.volumes_count > MAX_RAID_DEVICES) {
+			dprintk("too many volumes in stripe\n");
+			return -EIO;
+		}
+
+		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
+		if (!p)
+			return -EIO;
+		for (i = 0; i < b->stripe.volumes_count; i++)
+			b->stripe.volumes[i] = be32_to_cpup(p++);
+		break;
+	default:
+		dprintk("unknown volume type!\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * ->map() method for leaf volumes: copy the range, disk offset and block
+ * device of @dev into @map.  @offset is not consulted.  Always returns true.
+ */
+static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	map->bdev = dev->bdev;
+	map->disk_offset = dev->disk_offset;
+	map->len = dev->len;
+	map->start = dev->start;
+
+	return true;
+}
+
+/*
+ * ->map() method for concatenated volumes: find the child whose
+ * [start, start + len) range contains @offset and let it fill in @map,
+ * passing the offset relative to the start of that child.
+ *
+ * Returns the result of the child's ->map() method, or false if no child
+ * covers @offset.
+ */
+static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	u32 i;
+
+	for (i = 0; i < dev->nr_children; i++) {
+		struct pnfs_block_dev *child = &dev->children[i];
+
+		if (offset < child->start ||
+		    offset >= child->start + child->len)
+			continue;
+
+		/* propagate failures instead of reporting a stale map as good */
+		return child->map(child, offset - child->start, map);
+	}
+
+	dprintk("%s: ran off loop!\n", __func__);
+	return false;
+}
+
+/*
+ * ->map() method for striped volumes: pick the child that holds the chunk
+ * containing @offset, let it fill in @map and then narrow the mapping to
+ * that single chunk.
+ *
+ * Returns false if the stripe geometry is unusable, the chunk index is out
+ * of range, or the child's ->map() method fails; true otherwise.
+ */
+static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
+		struct pnfs_block_dev_map *map)
+{
+	struct pnfs_block_dev *child;
+	u64 chunk;
+	int chunk_idx;
+	u64 disk_offset;
+
+	/* both values are used as divisors below */
+	if (!dev->chunk_size || !dev->nr_children)
+		return false;
+
+	/*
+	 * NOTE(review): these are plain 64-bit divisions; confirm whether
+	 * 32-bit builds need the math64 helpers here.
+	 */
+	chunk = (offset / dev->chunk_size);
+	chunk_idx = chunk % dev->nr_children;
+
+	/* valid indices are 0 .. nr_children - 1 */
+	if (chunk_idx >= dev->nr_children) {
+		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
+			__func__, chunk_idx, offset, dev->chunk_size);
+		/* error, should not happen */
+		return false;
+	}
+
+	/* truncate offset to the beginning of the stripe */
+	offset = chunk * dev->chunk_size;
+
+	/* disk offset of the stripe */
+	disk_offset = offset / dev->nr_children;
+
+	child = &dev->children[chunk_idx];
+	if (!child->map(child, disk_offset, map))
+		return false;
+
+	map->start += offset;
+	map->disk_offset += disk_offset;
+	map->len = dev->chunk_size;
+	return true;
+}
+
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
+
+
+/*
+ * Set up @d for the simple volume at volumes[@idx]: resolve the device
+ * number through bl_resolve_deviceid(), open the block device and record
+ * its size.
+ *
+ * Returns 0 on success, -EIO if the volume cannot be resolved, or the error
+ * from opening the block device.  d->bdev is only assigned on success, so
+ * the cleanup path never sees an error pointer.
+ */
+static int
+bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	struct block_device *bdev;
+	dev_t dev;
+
+	dev = bl_resolve_deviceid(server, v, gfp_mask);
+	if (!dev)
+		return -EIO;
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+	if (IS_ERR(bdev)) {
+		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+		return PTR_ERR(bdev);
+	}
+	d->bdev = bdev;
+
+	d->len = i_size_read(d->bdev->bd_inode);
+	d->map = bl_map_simple;
+
+	printk(KERN_INFO "pNFS: using block device %s\n",
+		d->bdev->bd_disk->disk_name);
+	return 0;
+}
+
+/*
+ * Set up @d for the slice volume at volumes[@idx]: parse the referenced
+ * volume into @d and then restrict it to the slice's start and length.
+ *
+ * Returns 0 on success, -EIO if the slice refers to a volume that is not at
+ * a lower index than itself, or the error from parsing the referenced
+ * volume.
+ */
+static int
+bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	int ret;
+
+	/*
+	 * Only lower-indexed volumes may be referenced.  This keeps the
+	 * index inside the volume array and rules out reference cycles,
+	 * which would otherwise recurse without bound.
+	 */
+	if (v->slice.volume >= (u32)idx) {
+		dprintk("%s: invalid volume reference\n", __func__);
+		return -EIO;
+	}
+
+	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
+	if (ret)
+		return ret;
+
+	d->disk_offset = v->slice.start;
+	d->len = v->slice.len;
+	return 0;
+}
+
+/*
+ * Set up @d for the concat volume at volumes[@idx]: allocate one child per
+ * member volume, parse each member and lay the children out back to back.
+ *
+ * Returns 0 on success, -ENOMEM if the children array cannot be allocated,
+ * -EIO if a member is not at a lower index than the concat volume itself,
+ * or the error from parsing a member.  On failure the children parsed so
+ * far are left in @d for the caller to release.
+ */
+static int
+bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	/* honour the allocation context the caller asked for */
+	d->children = kcalloc(v->concat.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->concat.volumes_count; i++) {
+		/*
+		 * Only lower-indexed volumes may be referenced; this bounds
+		 * the array index and rules out reference cycles.
+		 */
+		if (v->concat.volumes[i] >= (u32)idx) {
+			dprintk("%s: invalid volume reference\n", __func__);
+			return -EIO;
+		}
+
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->concat.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		d->children[i].start += len;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->map = bl_map_concat;
+	return 0;
+}
+
+/*
+ * Set up @d for the stripe volume at volumes[@idx]: allocate one child per
+ * member volume, parse each member and record the chunk size.
+ *
+ * Returns 0 on success, -ENOMEM if the children array cannot be allocated,
+ * -EIO if the stripe has no members, a zero chunk size, or a member that is
+ * not at a lower index than the stripe volume itself, or the error from
+ * parsing a member.  On failure the children parsed so far are left in @d
+ * for the caller to release.
+ */
+static int
+bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	struct pnfs_block_volume *v = &volumes[idx];
+	u64 len = 0;
+	int ret, i;
+
+	/* bl_map_stripe() divides by both of these */
+	if (!v->stripe.volumes_count || !v->stripe.chunk_size) {
+		dprintk("%s: invalid stripe geometry\n", __func__);
+		return -EIO;
+	}
+
+	/* honour the allocation context the caller asked for */
+	d->children = kcalloc(v->stripe.volumes_count,
+			sizeof(struct pnfs_block_dev), gfp_mask);
+	if (!d->children)
+		return -ENOMEM;
+
+	for (i = 0; i < v->stripe.volumes_count; i++) {
+		/*
+		 * Only lower-indexed volumes may be referenced; this bounds
+		 * the array index and rules out reference cycles.
+		 */
+		if (v->stripe.volumes[i] >= (u32)idx) {
+			dprintk("%s: invalid volume reference\n", __func__);
+			return -EIO;
+		}
+
+		ret = bl_parse_deviceid(server, &d->children[i],
+				volumes, v->stripe.volumes[i], gfp_mask);
+		if (ret)
+			return ret;
+
+		d->nr_children++;
+		len += d->children[i].len;
+	}
+
+	d->len = len;
+	d->chunk_size = v->stripe.chunk_size;
+	d->map = bl_map_stripe;
+	return 0;
+}
+
+/*
+ * Build @d from the decoded volume at volumes[@idx] by handing it to the
+ * parser that matches its type.
+ *
+ * Returns the result of that parser, or -EIO for an unsupported type.
+ */
+static int
+bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
+		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+	int ret;
+
+	switch (volumes[idx].type) {
+	case PNFS_BLOCK_VOLUME_SIMPLE:
+		ret = bl_parse_simple(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_SLICE:
+		ret = bl_parse_slice(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_CONCAT:
+		ret = bl_parse_concat(server, d, volumes, idx, gfp_mask);
+		break;
+	case PNFS_BLOCK_VOLUME_STRIPE:
+		ret = bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+		break;
+	default:
+		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		ret = -EIO;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Decode the volume array in @pdev and build the block device hierarchy
+ * for it, starting from the last volume in the array.
+ *
+ * Returns the initialised deviceid node embedded in the top-level device,
+ * or NULL if an allocation fails, the XDR cannot be decoded, the array is
+ * empty, or the hierarchy cannot be set up.
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+		gfp_t gfp_mask)
+{
+	struct nfs4_deviceid_node *node = NULL;
+	struct pnfs_block_volume *volumes;
+	struct pnfs_block_dev *top;
+	struct xdr_stream xdr;
+	struct xdr_buf buf;
+	struct page *scratch;
+	int nr_volumes, ret, i;
+	__be32 *p;
+
+	scratch = alloc_page(gfp_mask);
+	if (!scratch)
+		goto out;
+
+	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+
+	p = xdr_inline_decode(&xdr, sizeof(__be32));
+	if (!p)
+		goto out_free_scratch;
+	nr_volumes = be32_to_cpup(p++);
+
+	/*
+	 * The root is volumes[nr_volumes - 1], so an empty array (or a count
+	 * that does not fit a signed int) must be rejected before indexing.
+	 */
+	if (nr_volumes < 1) {
+		dprintk("%s: invalid volume count\n", __func__);
+		goto out_free_scratch;
+	}
+
+	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
+			  gfp_mask);
+	if (!volumes)
+		goto out_free_scratch;
+
+	for (i = 0; i < nr_volumes; i++) {
+		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
+		if (ret < 0)
+			goto out_free_volumes;
+	}
+
+	top = kzalloc(sizeof(*top), gfp_mask);
+	if (!top)
+		goto out_free_volumes;
+
+	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
+	if (ret) {
+		/* drop whatever part of the hierarchy was already built */
+		bl_free_device(top);
+		kfree(top);
+		goto out_free_volumes;
+	}
+
+	node = &top->node;
+	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
+
+out_free_volumes:
+	kfree(volumes);
+out_free_scratch:
+	__free_page(scratch);
+out:
+	return node;
+}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index bfb0486..8d04bda 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -34,94 +34,53 @@

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

-static void bl_dm_remove(struct net *net, dev_t dev)
+static void
+nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
{
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_dev_msg bl_umount_request;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_UMOUNT,
- .totallen = sizeof(bl_umount_request),
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- dprintk("Entering %s\n", __func__);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->len = sizeof(bl_msg) + bl_msg.totallen;
- msg->data = kzalloc(msg->len, GFP_NOFS);
- if (!msg->data)
- goto out;
-
- memset(&bl_umount_request, 0, sizeof(bl_umount_request));
- bl_umount_request.major = MAJOR(dev);
- bl_umount_request.minor = MINOR(dev);
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
- add_wait_queue(&nn->bl_wq, &wq);
- if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
+ int i;
+
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->simple.nr_sigs);
+ for (i = 0; i < b->simple.nr_sigs; i++) {
+ p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
+ p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
+ b->simple.sigs[i].sig_len);
}
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
- kfree(msg->data);
}

-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct nfs4_deviceid_node *
-bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+dev_t
+bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
gfp_t gfp_mask)
{
- struct pnfs_block_dev *rv;
- struct block_device *bd;
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_MOUNT,
- .totallen = dev->mincount,
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- int offset, len, i, rc;
struct net *net = server->nfs_client->cl_net;
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct bl_dev_msg *reply = &nn->bl_mount_reply;
+ struct bl_pipe_msg bl_pipe_msg;
+ struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+ struct bl_msg_hdr *bl_msg;
+ DECLARE_WAITQUEUE(wq, current);
+ dev_t dev = 0;
+ int rc;

dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
- dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
- dev->mincount);

bl_pipe_msg.bl_wq = &nn->bl_wq;
+
+ b->simple.len += 4; /* single volume */
+ if (b->simple.len > PAGE_SIZE)
+ return -EIO;
+
memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+ msg->len = sizeof(*bl_msg) + b->simple.len;
+ msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
goto out;

- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- len = dev->mincount;
- offset = sizeof(bl_msg);
- for (i = 0; len > 0; i++) {
- memcpy(&dataptr[offset], page_address(dev->pages[i]),
- len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- offset += PAGE_CACHE_SIZE;
- }
- msg->len = sizeof(bl_msg) + dev->mincount;
+ bl_msg = msg->data;
+ bl_msg->type = BL_DEVICE_MOUNT,
+ bl_msg->totallen = b->simple.len;
+ nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);

dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
add_wait_queue(&nn->bl_wq, &wq);
@@ -142,46 +101,10 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
goto out;
}

- bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
- FMODE_READ, NULL);
- if (IS_ERR(bd)) {
- printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
- __func__, reply->major, reply->minor,
- PTR_ERR(bd));
- goto out;
- }
-
- rv = kzalloc(sizeof(*rv), gfp_mask);
- if (!rv)
- goto out;
-
- nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
- rv->d_bdev = bd;
-
- dprintk("%s Created device %s with bd_block_size %u\n",
- __func__,
- bd->bd_disk->disk_name,
- bd->bd_block_size);
-
- kfree(msg->data);
- return &rv->d_node;
-
+ dev = MKDEV(reply->major, reply->minor);
out:
kfree(msg->data);
- return NULL;
-}
-
-void
-bl_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
- struct pnfs_block_dev *dev =
- container_of(d, struct pnfs_block_dev, d_node);
- struct net *net = d->nfs_client->cl_net;
-
- blkdev_put(dev->d_bdev, FMODE_READ);
- bl_dm_remove(net, dev->d_bdev->bd_dev);
-
- kfree(dev);
+ return dev;
}

static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
--
1.9.1


2014-09-03 04:36:34

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 3/6] pnfs/blocklayout: refactor extent processing

Factor out a helper for all per-extent work, and merge the now trivial
functions for lseg allocation and parsing.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 207 ++++++++++++++++++++-------------------
1 file changed, 105 insertions(+), 102 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index df8c567..28a8102 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -513,144 +513,147 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
return 0;
}

-/* XDR decode pnfs_block_layout4 structure */
static int
-nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
+bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
+ struct layout_verification *lv, struct list_head *extents,
+ gfp_t gfp_mask)
{
- struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
- int i, status = -EIO;
- uint32_t count;
- struct pnfs_block_extent *be = NULL, *save;
- struct xdr_stream stream;
- struct xdr_buf buf;
- struct page *scratch;
+ struct pnfs_block_extent *be;
+ struct nfs4_deviceid id;
+ int error;
__be32 *p;
+
+ p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return -EIO;
+
+ be = kzalloc(sizeof(*be), GFP_NOFS);
+ if (!be)
+ return -ENOMEM;
+
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
+ p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+
+ error = -EIO;
+ be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_mask);
+ if (!be->be_device)
+ goto out_free_be;
+
+ /*
+ * The next three values are read in as bytes, but stored in the
+ * extent structure in 512-byte granularity.
+ */
+ if (decode_sector_number(&p, &be->be_f_offset) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_length) < 0)
+ goto out_put_deviceid;
+ if (decode_sector_number(&p, &be->be_v_offset) < 0)
+ goto out_put_deviceid;
+ be->be_state = be32_to_cpup(p++);
+
+ error = verify_extent(be, lv);
+ if (error) {
+ dprintk("%s: extent verification failed\n", __func__);
+ goto out_put_deviceid;
+ }
+
+ list_add_tail(&be->be_list, extents);
+ return 0;
+
+out_put_deviceid:
+ nfs4_put_deviceid_node(be->be_device);
+out_free_be:
+ kfree(be);
+ return error;
+}
+
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_mask)
+{
struct layout_verification lv = {
.mode = lgr->range.iomode,
.start = lgr->range.offset >> SECTOR_SHIFT,
.inval = lgr->range.offset >> SECTOR_SHIFT,
.cowread = lgr->range.offset >> SECTOR_SHIFT,
};
+ struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+ struct pnfs_layout_segment *lseg;
+ struct xdr_buf buf;
+ struct xdr_stream xdr;
+ struct page *scratch;
+ int status, i;
+ uint32_t count;
+ __be32 *p;
LIST_HEAD(extents);

dprintk("---> %s\n", __func__);

- scratch = alloc_page(gfp_flags);
+ lseg = kzalloc(sizeof(*lseg), gfp_mask);
+ if (!lseg)
+ return ERR_PTR(-ENOMEM);
+
+ status = -ENOMEM;
+ scratch = alloc_page(gfp_mask);
if (!scratch)
- return -ENOMEM;
+ goto out;

- xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+ xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);

- p = xdr_inline_decode(&stream, 4);
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
if (unlikely(!p))
- goto out_err;
+ goto out_free_scratch;

count = be32_to_cpup(p++);
+ dprintk("%s: number of extents %d\n", __func__, count);

- dprintk("%s enter, number of extents %i\n", __func__, count);
- p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
- if (unlikely(!p))
- goto out_err;
-
- /* Decode individual extents, putting them in temporary
- * staging area until whole layout is decoded to make error
- * recovery easier.
+ /*
+ * Decode individual extents, putting them in temporary staging area
+ * until whole layout is decoded to make error recovery easier.
*/
for (i = 0; i < count; i++) {
- struct nfs4_deviceid id;
-
- be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
- if (!be) {
- status = -ENOMEM;
- goto out_err;
- }
- memcpy(&id, p, NFS4_DEVICEID4_SIZE);
- p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-
- be->be_device =
- nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
- lo->plh_lc_cred, gfp_flags);
- if (!be->be_device)
- goto out_err;
-
- /* The next three values are read in as bytes,
- * but stored as 512-byte sector lengths
- */
- if (decode_sector_number(&p, &be->be_f_offset) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_length) < 0)
- goto out_err;
- if (decode_sector_number(&p, &be->be_v_offset) < 0)
- goto out_err;
- be->be_state = be32_to_cpup(p++);
- if (verify_extent(be, &lv)) {
- dprintk("%s verify failed\n", __func__);
- goto out_err;
- }
- list_add_tail(&be->be_list, &extents);
+ status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
+ if (status)
+ goto process_extents;
}
+
if (lgr->range.offset + lgr->range.length !=
lv.start << SECTOR_SHIFT) {
dprintk("%s Final length mismatch\n", __func__);
- be = NULL;
- goto out_err;
+ status = -EIO;
+ goto process_extents;
}
+
if (lv.start < lv.cowread) {
dprintk("%s Final uncovered COW extent\n", __func__);
- be = NULL;
- goto out_err;
- }
- /* Extents decoded properly, now try to merge them in to
- * existing layout extents.
- */
- list_for_each_entry_safe(be, save, &extents, be_list) {
- list_del(&be->be_list);
-
- status = ext_tree_insert(bl, be);
- if (status)
- goto out_free_list;
+ status = -EIO;
}
- status = 0;
- out:
- __free_page(scratch);
- dprintk("%s returns %i\n", __func__, status);
- return status;

- out_err:
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
- out_free_list:
+process_extents:
while (!list_empty(&extents)) {
- be = list_first_entry(&extents, struct pnfs_block_extent,
- be_list);
+ struct pnfs_block_extent *be =
+ list_first_entry(&extents, struct pnfs_block_extent,
+ be_list);
list_del(&be->be_list);
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
- }
- goto out;
-}

-/* We pretty much ignore lseg, and store all data layout wide, so we
- * can correctly merge.
- */
-static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
- struct nfs4_layoutget_res *lgr,
- gfp_t gfp_flags)
-{
- struct pnfs_layout_segment *lseg;
- int status;
+ if (!status)
+ status = ext_tree_insert(bl, be);

- dprintk("%s enter\n", __func__);
- lseg = kzalloc(sizeof(*lseg), gfp_flags);
- if (!lseg)
- return ERR_PTR(-ENOMEM);
- status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
+ if (status) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+ }
+
+out_free_scratch:
+ __free_page(scratch);
+out:
+ dprintk("%s returns %d\n", __func__, status);
if (status) {
- /* We don't want to call the full-blown bl_free_lseg,
- * since on error extents were not touched.
- */
kfree(lseg);
return ERR_PTR(status);
}
--
1.9.1


2014-09-07 17:50:28

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/6] pnfs: enable CB_NOTIFY_DEVICEID support

On Tue, Sep 02, 2014 at 09:38:34PM -0700, Christoph Hellwig wrote:
> This code has been around for a while, but never was enabled. Turns out it
> really does work out of the box at least for the block layout driver, so
> we just need to wire it up, and in case of NOTIFY_DEVICEID4_CHANGE remove
> a conditional that returns an error.
>
> Note that we implement NOTIFY_DEVICEID4_CHANGE identically to
> NOTIFY_DEVICEID4_DELETE. Given that in either case we can't do anything
> but prevent further lookups of a given device ID, there isn't much difference
> in semantics for the two. For the delete case the server MUST ensure that
> there are no outstanding layouts, while for the change case it doesn't, but
> that has little relevance to the client.

I got a comment that we should probably just enable CB_NOTIFY_DEVICEID
for all layout types unconditionally as there really isn't anything
layout type specific in it. I'm tempted to agree, so if anyone
disagrees speak up now, or I'll resend it that way in a few days.


2014-09-03 04:36:26

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 1/6] pnfs: enable CB_NOTIFY_DEVICEID support

This code has been around for a while, but never was enabled. Turns out it
really does work out of the box at least for the block layout driver, so
we just need to wire it up, and in case of NOTIFY_DEVICEID4_CHANGE remove
a conditional that returns an error.

Note that we implement NOTIFY_DEVICEID4_CHANGE identically to
NOTIFY_DEVICEID4_DELETE. Given that in either case we can't do anything
but prevent further lookups of a given device ID, there isn't much difference
in semantics for the two. For the delete case the server MUST ensure that
there are no outstanding layouts, while for the change case it doesn't, but
that has little relevance to the client.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/callback_proc.c | 3 ---
fs/nfs/nfs4xdr.c | 35 +++++++++++++++++++++++++++--------
fs/nfs/pnfs.h | 2 ++
fs/nfs/pnfs_dev.c | 1 +
4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 86541e0..1cdd345 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -287,9 +287,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
}

found:
- if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
- dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
- "deleting instead\n", __func__);
nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
}

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f2cd957..65864ec 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -373,14 +373,19 @@ static int nfs4_stat_to_errno(int);
NFS4_DEVICEID4_SIZE) \
/* gdlr_deviceid_list */ + \
1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
- XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+ 1 /* layout type */ + \
+ 1 /* maxcount */ + \
+ 1 /* bitmap size */ + \
+ 1 /* notification bitmap length */ + \
+ 1 /* notification bitmap, word 0 */)
#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
1 /* layout type */ + \
1 /* opaque devaddr4 length */ + \
/* devaddr4 payload is read into page */ \
1 /* notification bitmap length */ + \
- 1 /* notification bitmap */)
+ 1 /* notification bitmap, word 0 */)
#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
encode_stateid_maxsz)
#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -1955,12 +1960,20 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
__be32 *p;

encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
- p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+ p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(args->pdev->layout_type);
*p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
- *p++ = cpu_to_be32(0); /* bitmap length 0 */
+
+ if (args->pdev->notify_types) {
+ p = reserve_space(xdr, 4 + 4);
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(args->pdev->notify_types);
+ } else {
+ p = reserve_space(xdr, 4);
+ *p++ = cpu_to_be32(0);
+ }
}

static void
@@ -5870,9 +5883,15 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 4 * len);
if (unlikely(!p))
goto out_overflow;
- for (i = 0; i < len; i++, p++) {
- if (be32_to_cpup(p)) {
- dprintk("%s: notifications not supported\n",
+
+ if (be32_to_cpup(p++) & ~pdev->notify_types) {
+ dprintk("%s: no notification support\n",
+ __func__);
+ }
+
+ for (i = 1; i < len; i++) {
+ if (be32_to_cpup(p++)) {
+ dprintk("%s: unsupported notification\n",
__func__);
return -EIO;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4746ede..a5d3067 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -84,6 +84,7 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
+ unsigned notify_types;
unsigned max_deviceinfo_size;

int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
@@ -161,6 +162,7 @@ struct pnfs_layout_hdr {
struct pnfs_device {
struct nfs4_deviceid dev_id;
unsigned int layout_type;
+ unsigned int notify_types;
unsigned int mincount;
unsigned int maxcount; /* gdia_maxcount */
struct page **pages;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 82c2836..d44334a 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -132,6 +132,7 @@ nfs4_get_device_info(struct nfs_server *server,

memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->notify_types = server->pnfs_curr_ld->notify_types;
pdev->pages = pages;
pdev->pgbase = 0;
pdev->pglen = max_resp_sz;
--
1.9.1


2014-09-03 19:31:17

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 5/6] pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing

Hey Christoph,


On 09/03/2014 12:38 AM, Christoph Hellwig wrote:
> This patch moves parsing of the GETDEVICEINFO XDR to kernel space, as well
> as the management of complex devices. The reason for that is we might have
> multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
> device mapper or md can't handle as they claim devices exclusively.
>
> But as it turns out simple striping / concatenation is fairly trivial to
> implement anyway, so we make our life simpler by reducing the reliance
> on blkmapd. For now we still use blkmapd by feeding it synthetic SIMPLE
> device XDR to translate device signatures to device numbers, but in the
> long run I have plans to eliminate it entirely.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/nfs/blocklayout/Makefile | 2 +-
> fs/nfs/blocklayout/blocklayout.c | 92 ++++++----
> fs/nfs/blocklayout/blocklayout.h | 81 ++++++++-
> fs/nfs/blocklayout/dev.c | 358 +++++++++++++++++++++++++++++++++++++++
> fs/nfs/blocklayout/rpc_pipefs.c | 141 ++++-----------
> 5 files changed, 526 insertions(+), 148 deletions(-)
> create mode 100644 fs/nfs/blocklayout/dev.c
>
> diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
> index e177026..3ca14c3 100644
> --- a/fs/nfs/blocklayout/Makefile
> +++ b/fs/nfs/blocklayout/Makefile
> @@ -3,4 +3,4 @@
> #
> obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
>
> -blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
> +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 7b3c8c9..e92591c 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio)
> return NULL;
> }
>
> -static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
> - struct pnfs_block_extent *be,
> - void (*end_io)(struct bio *, int err),
> - struct parallel_io *par)
> +static struct bio *
> +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
> + void (*end_io)(struct bio *, int err), struct parallel_io *par)
> {
> - struct pnfs_block_dev *dev =
> - container_of(be->be_device, struct pnfs_block_dev, d_node);
> struct bio *bio;
>
> npg = min(npg, BIO_MAX_PAGES);
> @@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
> }
>
> if (bio) {
> - bio->bi_iter.bi_sector = isect - be->be_f_offset +
> - be->be_v_offset;
> - bio->bi_bdev = dev->d_bdev;
> + bio->bi_iter.bi_sector = disk_sector;
> + bio->bi_bdev = bdev;
> bio->bi_end_io = end_io;
> bio->bi_private = par;
> }
> return bio;
> }
>
> -static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
> - sector_t isect, struct page *page,
> - struct pnfs_block_extent *be,
> - void (*end_io)(struct bio *, int err),
> - struct parallel_io *par,
> - unsigned int offset, int len)
> +static struct bio *
> +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
> + struct page *page, struct pnfs_block_dev_map *map,
> + struct pnfs_block_extent *be,
> + void (*end_io)(struct bio *, int err),
> + struct parallel_io *par, unsigned int offset, int *len)
> {
> - isect = isect + (offset >> SECTOR_SHIFT);
> + struct pnfs_block_dev *dev =
> + container_of(be->be_device, struct pnfs_block_dev, node);
> + u64 disk_addr, end;
> +
> dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
> - npg, rw, (unsigned long long)isect, offset, len);
> + npg, rw, (unsigned long long)isect, offset, *len);
> +
> + /* translate to device offset */
> + isect += be->be_v_offset;
> + isect -= be->be_f_offset;
> +
> + /* translate to physical disk offset */
> + disk_addr = (u64)isect << SECTOR_SHIFT;
> + if (disk_addr < map->start || disk_addr >= map->start + map->len) {
> + if (!dev->map(dev, disk_addr, map))
> + return ERR_PTR(-EIO);
> + bio = bl_submit_bio(rw, bio);
> + }
> + disk_addr += map->disk_offset;
> + disk_addr -= map->start;
> +
> + /* limit length to what the device mapping allows */
> + end = disk_addr + *len;
> + if (end >= map->start + map->len)
> + *len = map->start + map->len - disk_addr;
> +
> retry:
> if (!bio) {
> - bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
> + bio = bl_alloc_init_bio(npg, map->bdev,
> + disk_addr >> SECTOR_SHIFT, end_io, par);
> if (!bio)
> return ERR_PTR(-ENOMEM);
> }
> - if (bio_add_page(bio, page, len, offset) < len) {
> + if (bio_add_page(bio, page, *len, offset) < *len) {
> bio = bl_submit_bio(rw, bio);
> goto retry;
> }
> @@ -203,6 +223,7 @@ static enum pnfs_try_status
> bl_read_pagelist(struct nfs_pgio_header *header)
> {
> struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
> + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
> struct bio *bio = NULL;
> struct pnfs_block_extent be;
> sector_t isect, extent_length = 0;
> @@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header)
> pg_len = PAGE_CACHE_SIZE - pg_offset;
> else
> pg_len = bytes_left;
> -
> - f_offset += pg_len;
> - bytes_left -= pg_len;
> - isect += (pg_offset >> SECTOR_SHIFT);
> - extent_length -= (pg_offset >> SECTOR_SHIFT);
> } else {
> BUG_ON(pg_offset != 0);
> pg_len = PAGE_CACHE_SIZE;
> }
>
> + isect += (pg_offset >> SECTOR_SHIFT);
> + extent_length -= (pg_offset >> SECTOR_SHIFT);
> +
> if (is_hole(&be)) {
> bio = bl_submit_bio(READ, bio);
> /* Fill hole w/ zeroes w/o accessing device */
> dprintk("%s Zeroing page for hole\n", __func__);
> zero_user_segment(pages[i], pg_offset, pg_len);
> +
> + /* invalidate map */
> + map.start = NFS4_MAX_UINT64;
> } else {
> bio = do_add_page_to_bio(bio,
> header->page_array.npages - i,
> READ,
> - isect, pages[i], &be,
> + isect, pages[i], &map, &be,
> bl_end_io_read, par,
> - pg_offset, pg_len);
> + pg_offset, &pg_len);
> if (IS_ERR(bio)) {
> header->pnfs_error = PTR_ERR(bio);
> bio = NULL;
> @@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header)
> }
> isect += (pg_len >> SECTOR_SHIFT);
> extent_length -= (pg_len >> SECTOR_SHIFT);
> + f_offset += pg_len;
> + bytes_left -= pg_len;
> }
> if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
> header->res.eof = 1;
> @@ -346,6 +370,7 @@ static enum pnfs_try_status
> bl_write_pagelist(struct nfs_pgio_header *header, int sync)
> {
> struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
> + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
> struct bio *bio = NULL;
> struct pnfs_block_extent be;
> sector_t isect, extent_length = 0;
> @@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
> size_t count = header->args.count;
> struct page **pages = header->args.pages;
> int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
> + unsigned int pg_len;
> struct blk_plug plug;
> int i;
>
> @@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
> extent_length = be.be_length - (isect - be.be_f_offset);
> }
>
> + pg_len = PAGE_CACHE_SIZE;
> bio = do_add_page_to_bio(bio, header->page_array.npages - i,
> - WRITE, isect, pages[i], &be,
> + WRITE, isect, pages[i], &map, &be,
> bl_end_io_write, par,
> - 0, PAGE_CACHE_SIZE);
> + 0, &pg_len);
> if (IS_ERR(bio)) {
> header->pnfs_error = PTR_ERR(bio);
> bio = NULL;
> goto out;
> }
> - offset += PAGE_CACHE_SIZE;
> - count -= PAGE_CACHE_SIZE;
> - isect += PAGE_CACHE_SECTORS;
> - extent_length -= PAGE_CACHE_SECTORS;
> +
> + offset += pg_len;
> + count -= pg_len;
> + isect += (pg_len >> SECTOR_SHIFT);
> + extent_length -= (pg_len >> SECTOR_SHIFT);
> }
>
> header->res.count = header->args.count;
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index c98d98a..3077391 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -44,9 +44,75 @@
> #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
> #define SECTOR_SIZE (1 << SECTOR_SHIFT)
>
> +struct pnfs_block_dev;
> +
> +enum pnfs_block_volume_type {
> + PNFS_BLOCK_VOLUME_SIMPLE = 0,
> + PNFS_BLOCK_VOLUME_SLICE = 1,
> + PNFS_BLOCK_VOLUME_CONCAT = 2,
> + PNFS_BLOCK_VOLUME_STRIPE = 3,
> +};
> +
> +#define PNFS_BLOCK_MAX_UUIDS 4
> +
> +/*
> + * Random upper cap for the uuid length to avoid unbounded allocation.
> + * Not actually limited by the protocol.
> + */
> +#define PNFS_BLOCK_UUID_LEN 128
> +
> +struct pnfs_block_volume {
> + enum pnfs_block_volume_type type;
> + union {
> + struct {
> + int len;
> + int nr_sigs;
> + struct {
> + u64 offset;
> + u32 sig_len;
> + u8 sig[PNFS_BLOCK_UUID_LEN];
> + } sigs[PNFS_BLOCK_MAX_UUIDS];
> + } simple;
> + struct {
> + u64 start;
> + u64 len;
> + u32 volume;
> + } slice;
> + struct {
> + u32 volumes_count;
> + u32 volumes[MAX_RAID_DEVICES];
> + } concat;
> + struct {
> + u64 chunk_size;
> + u32 volumes_count;
> + u32 volumes[MAX_RAID_DEVICES];
> + } stripe;
> + };
> +};
> +
> +struct pnfs_block_dev_map {
> + sector_t start;
> + sector_t len;
> +
> + sector_t disk_offset;
> + struct block_device *bdev;
> +};
> +
> struct pnfs_block_dev {
> - struct nfs4_deviceid_node d_node;
> - struct block_device *d_bdev;
> + struct nfs4_deviceid_node node;
> +
> + u64 start;
> + u64 len;
> +
> + u32 nr_children;
> + struct pnfs_block_dev *children;
> + u64 chunk_size;
> +
> + struct block_device *bdev;
> + u64 disk_offset;
> +
> + bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> + struct pnfs_block_dev_map *map);
> };
>
> enum exstate4 {
> @@ -110,6 +176,11 @@ struct bl_msg_hdr {
> #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
> #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
>
> +/* dev.c */
> +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
> + struct pnfs_device *pdev, gfp_t gfp_mask);
> +void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
> +
> /* extent_tree.c */
> int ext_tree_insert(struct pnfs_block_layout *bl,
> struct pnfs_block_extent *new);
> @@ -123,10 +194,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
> void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
>
> /* rpc_pipefs.c */
> -struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
> - struct pnfs_device *pdev, gfp_t gfp_mask);
> -void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
> -
> +dev_t bl_resolve_deviceid(struct nfs_server *server,
> + struct pnfs_block_volume *b, gfp_t gfp_mask);
> int __init bl_init_pipefs(void);
> void __exit bl_cleanup_pipefs(void);
>
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> new file mode 100644
> index 0000000..ae18f80
> --- /dev/null
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -0,0 +1,358 @@
> +/*
> + * Copyright (c) 2014 Christoph Hellwig.
> + */
> +#include <linux/sunrpc/svc.h>
> +#include <linux/blkdev.h>
> +#include <linux/nfs4.h>
> +#include <linux/nfs_fs.h>
> +#include <linux/nfs_xdr.h>
> +
> +#include "blocklayout.h"
> +
> +static void
> +bl_free_device(struct pnfs_block_dev *dev)
> +{
> + if (dev->nr_children) {
> + int i;
> +
> + for (i = 0; i < dev->nr_children; i++)
> + bl_free_device(&dev->children[i]);
> + kfree(dev->children);
> + } else {
> + if (dev->bdev)
> + blkdev_put(dev->bdev, FMODE_READ);

else if (dev->bdev)? :)

> + }
> +}
> +
> +void
> +bl_free_deviceid_node(struct nfs4_deviceid_node *d)
> +{
> + struct pnfs_block_dev *dev =
> + container_of(d, struct pnfs_block_dev, node);
> +
> + bl_free_device(dev);
> + kfree(dev);
> +}
> +
> +static int
> +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> +{
> + __be32 *p;
> + int i;
> +
> + p = xdr_inline_decode(xdr, 4);
> + if (!p)
> + return -EIO;
> + b->type = be32_to_cpup(p++);
> +
> + switch (b->type) {
> + case PNFS_BLOCK_VOLUME_SIMPLE:
> + p = xdr_inline_decode(xdr, 4);
> + if (!p)
> + return -EIO;
> + b->simple.nr_sigs = be32_to_cpup(p++);
> + if (!b->simple.nr_sigs) {
> + dprintk("no signature\n");
> + return -EIO;
> + }
> +
> + b->simple.len = 4 + 4;
> + for (i = 0; i < b->simple.nr_sigs; i++) {
> + p = xdr_inline_decode(xdr, 8 + 4);
> + if (!p)
> + return -EIO;
> + p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
> + b->simple.sigs[i].sig_len = be32_to_cpup(p++);
> +
> + p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
> + if (!p)
> + return -EIO;
> + memcpy(&b->simple.sigs[i].sig, p,
> + b->simple.sigs[i].sig_len);
> +
> + b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
> + }
> + break;
> + case PNFS_BLOCK_VOLUME_SLICE:
> + p = xdr_inline_decode(xdr, 8 + 8 + 4);
> + if (!p)
> + return -EIO;
> + p = xdr_decode_hyper(p, &b->slice.start);
> + p = xdr_decode_hyper(p, &b->slice.len);
> + b->slice.volume = be32_to_cpup(p++);
> + break;
> + case PNFS_BLOCK_VOLUME_CONCAT:
> + p = xdr_inline_decode(xdr, 4);
> + if (!p)
> + return -EIO;
> + b->concat.volumes_count = be32_to_cpup(p++);
> +
> + p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
> + if (!p)
> + return -EIO;
> + for (i = 0; i < b->concat.volumes_count; i++)
> + b->concat.volumes[i] = be32_to_cpup(p++);
> + break;
> + case PNFS_BLOCK_VOLUME_STRIPE:
> + p = xdr_inline_decode(xdr, 8 + 4);
> + if (!p)
> + return -EIO;
> + p = xdr_decode_hyper(p, &b->stripe.chunk_size);
> + b->stripe.volumes_count = be32_to_cpup(p++);
> +
> + p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
> + if (!p)
> + return -EIO;
> + for (i = 0; i < b->stripe.volumes_count; i++)
> + b->stripe.volumes[i] = be32_to_cpup(p++);
> + break;
> + default:
> + dprintk("unknown volume type!\n");
> + return -EIO;
> + }

Can you make each of these cases a helper function?

> +
> + return 0;
> +}
> +
> +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
> + struct pnfs_block_dev_map *map)
> +{
> + map->start = dev->start;
> + map->len = dev->len;
> + map->disk_offset = dev->disk_offset;
> + map->bdev = dev->bdev;
> + return true;
> +}
> +
> +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
> + struct pnfs_block_dev_map *map)
> +{
> + int i;
> +
> + for (i = 0; i < dev->nr_children; i++) {
> + struct pnfs_block_dev *child = &dev->children[i];
> +
> + if (child->start > offset ||
> + child->start + child->len <= offset)
> + continue;
> +
> + child->map(child, offset - child->start, map);
> + return true;
> + }
> +
> + dprintk("%s: ran off loop!\n", __func__);
> + return false;
> +}
> +
> +/*
> + * Map @offset within a striped volume onto the child that holds it.
> + *
> + * The chunk number is offset / chunk_size and the child index is that
> + * chunk number modulo nr_children. The child's own ->map() is invoked with
> + * the per-child disk offset, after which @map is adjusted: start is
> + * advanced by the chunk-aligned volume offset, disk_offset by the
> + * per-child disk offset, and len is set to one chunk.
> + *
> + * Returns true on success, false if the computed child index is out of
> + * range.
> + */
> +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
> + struct pnfs_block_dev_map *map)
> +{
> + struct pnfs_block_dev *child;
> + u64 chunk = (offset / dev->chunk_size);
> + int chunk_idx = chunk % dev->nr_children;
> + u64 disk_offset;
> +
> + /* children[] has nr_children entries, so nr_children itself is invalid */
> + if (chunk_idx >= dev->nr_children) {
> + dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
> + __func__, chunk_idx, offset, dev->chunk_size);
> + /* error, should not happen */
> + return false;
> + }
> +
> + /* truncate offset to the beginning of the stripe */
> + offset = chunk * dev->chunk_size;
> +
> + /* disk offset of the stripe */
> + disk_offset = offset / dev->nr_children;
> +
> + child = &dev->children[chunk_idx];
> + child->map(child, disk_offset, map);
> +
> + map->start += offset;
> + map->disk_offset += disk_offset;
> + map->len = dev->chunk_size;
> + return true;
> +}
> +
> +static int
> +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);

Why put the declaration in the middle of the file?

Anna

> +
> +
> +/*
> + * Set up @d for the SIMPLE volume at @volumes[@idx].
> + *
> + * The volume is resolved to a dev_t through bl_resolve_deviceid(), the
> + * block device is opened read-only, and @d gets the device size as its
> + * length and bl_map_simple as its mapping function.
> + *
> + * Returns 0 on success, -EIO if the volume could not be resolved, or the
> + * error from blkdev_get_by_dev().
> + */
> +static int
> +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + struct pnfs_block_volume *v = &volumes[idx];
> + struct block_device *bdev;
> + dev_t dev;
> +
> + dev = bl_resolve_deviceid(server, v, gfp_mask);
> + if (!dev)
> + return -EIO;
> +
> + /*
> + * Hold the result in a local until it is known to be valid:
> + * bl_free_device() calls blkdev_put() on any non-NULL d->bdev, so an
> + * ERR_PTR value must never be stored there.
> + */
> + bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
> + if (IS_ERR(bdev)) {
> + printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
> + MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
> + return PTR_ERR(bdev);
> + }
> + d->bdev = bdev;
> +
> + d->len = i_size_read(d->bdev->bd_inode);
> + d->map = bl_map_simple;
> +
> + printk(KERN_INFO "pNFS: using block device %s\n",
> + d->bdev->bd_disk->disk_name);
> + return 0;
> +}
> +
> +static int
> +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + struct pnfs_block_volume *v = &volumes[idx];
> + int ret;
> +
> + ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
> + if (ret)
> + return ret;
> +
> + d->disk_offset = v->slice.start;
> + d->len = v->slice.len;
> + return 0;
> +}
> +
> +/*
> + * Set up @d for the CONCAT volume at @volumes[@idx].
> + *
> + * One child is allocated per listed sub-volume and parsed recursively via
> + * bl_parse_deviceid(). Each child's start is advanced by the combined
> + * length of the children before it, and @d's length is the sum of all
> + * child lengths.
> + *
> + * Returns 0 on success, -ENOMEM if the child array cannot be allocated, or
> + * the error from parsing a child. nr_children counts only the children
> + * parsed successfully so far.
> + */
> +static int
> +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + struct pnfs_block_volume *v = &volumes[idx];
> + u64 len = 0;
> + int ret, i;
> +
> + /* honour the caller's allocation context instead of GFP_KERNEL */
> + d->children = kcalloc(v->concat.volumes_count,
> + sizeof(struct pnfs_block_dev), gfp_mask);
> + if (!d->children)
> + return -ENOMEM;
> +
> + for (i = 0; i < v->concat.volumes_count; i++) {
> + ret = bl_parse_deviceid(server, &d->children[i],
> + volumes, v->concat.volumes[i], gfp_mask);
> + if (ret)
> + return ret;
> +
> + d->nr_children++;
> + d->children[i].start += len;
> + len += d->children[i].len;
> + }
> +
> + d->len = len;
> + d->map = bl_map_concat;
> + return 0;
> +}
> +
> +/*
> + * Set up @d for the STRIPE volume at @volumes[@idx].
> + *
> + * One child is allocated per listed sub-volume and parsed recursively via
> + * bl_parse_deviceid(). @d's length is the sum of all child lengths, and
> + * the chunk size is taken from the volume description.
> + *
> + * Returns 0 on success, -ENOMEM if the child array cannot be allocated, or
> + * the error from parsing a child. nr_children counts only the children
> + * parsed successfully so far.
> + */
> +static int
> +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + struct pnfs_block_volume *v = &volumes[idx];
> + u64 len = 0;
> + int ret, i;
> +
> + /* honour the caller's allocation context instead of GFP_KERNEL */
> + d->children = kcalloc(v->stripe.volumes_count,
> + sizeof(struct pnfs_block_dev), gfp_mask);
> + if (!d->children)
> + return -ENOMEM;
> +
> + for (i = 0; i < v->stripe.volumes_count; i++) {
> + ret = bl_parse_deviceid(server, &d->children[i],
> + volumes, v->stripe.volumes[i], gfp_mask);
> + if (ret)
> + return ret;
> +
> + d->nr_children++;
> + len += d->children[i].len;
> + }
> +
> + d->len = len;
> + d->chunk_size = v->stripe.chunk_size;
> + d->map = bl_map_stripe;
> + return 0;
> +}
> +
> +static int
> +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> + switch (volumes[idx].type) {
> + case PNFS_BLOCK_VOLUME_SIMPLE:
> + return bl_parse_simple(server, d, volumes, idx, gfp_mask);
> + case PNFS_BLOCK_VOLUME_SLICE:
> + return bl_parse_slice(server, d, volumes, idx, gfp_mask);
> + case PNFS_BLOCK_VOLUME_CONCAT:
> + return bl_parse_concat(server, d, volumes, idx, gfp_mask);
> + case PNFS_BLOCK_VOLUME_STRIPE:
> + return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> + default:
> + dprintk("unsupported volume type: %d\n", volumes[idx].type);
> + return -EIO;
> + }
> +}
> +
> +/*
> + * Build a device tree from the XDR-encoded volume list in @pdev.
> + *
> + * The volume count and then each volume are decoded from pdev->pages. The
> + * last volume in the list is used as the top of the hierarchy and is
> + * parsed through bl_parse_deviceid().
> + *
> + * Returns the initialized deviceid node embedded in the top-level device,
> + * or NULL on any allocation, decoding or parsing failure.
> + */
> +struct nfs4_deviceid_node *
> +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
> + gfp_t gfp_mask)
> +{
> + struct nfs4_deviceid_node *node = NULL;
> + struct pnfs_block_volume *volumes;
> + struct pnfs_block_dev *top;
> + struct xdr_stream xdr;
> + struct xdr_buf buf;
> + struct page *scratch;
> + int nr_volumes, ret, i;
> + __be32 *p;
> +
> + scratch = alloc_page(gfp_mask);
> + if (!scratch)
> + goto out;
> +
> + xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
> + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
> +
> + p = xdr_inline_decode(&xdr, sizeof(__be32));
> + if (!p)
> + goto out_free_scratch;
> + nr_volumes = be32_to_cpup(p++);
> + /*
> + * The count comes off the wire: reject an empty list and values that
> + * turn negative as an int, since volumes[nr_volumes - 1] is used below.
> + */
> + if (nr_volumes <= 0)
> + goto out_free_scratch;
> +
> + volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
> + gfp_mask);
> + if (!volumes)
> + goto out_free_scratch;
> +
> + for (i = 0; i < nr_volumes; i++) {
> + ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
> + if (ret < 0)
> + goto out_free_volumes;
> + }
> +
> + top = kzalloc(sizeof(*top), gfp_mask);
> + if (!top)
> + goto out_free_volumes;
> +
> + /* the last volume in the list is the top of the hierarchy */
> + ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
> + if (ret) {
> + bl_free_device(top);
> + kfree(top);
> + goto out_free_volumes;
> + }
> +
> + node = &top->node;
> + nfs4_init_deviceid_node(node, server, &pdev->dev_id);
> +
> +out_free_volumes:
> + kfree(volumes);
> +out_free_scratch:
> + __free_page(scratch);
> +out:
> + return node;
> +}
> diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> index bfb0486..8d04bda 100644
> --- a/fs/nfs/blocklayout/rpc_pipefs.c
> +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> @@ -34,94 +34,53 @@
>
> #define NFSDBG_FACILITY NFSDBG_PNFS_LD
>
> -static void bl_dm_remove(struct net *net, dev_t dev)
> +static void
> +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
> {
> - struct bl_pipe_msg bl_pipe_msg;
> - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> - struct bl_dev_msg bl_umount_request;
> - struct bl_msg_hdr bl_msg = {
> - .type = BL_DEVICE_UMOUNT,
> - .totallen = sizeof(bl_umount_request),
> - };
> - uint8_t *dataptr;
> - DECLARE_WAITQUEUE(wq, current);
> - struct nfs_net *nn = net_generic(net, nfs_net_id);
> -
> - dprintk("Entering %s\n", __func__);
> -
> - bl_pipe_msg.bl_wq = &nn->bl_wq;
> - memset(msg, 0, sizeof(*msg));
> - msg->len = sizeof(bl_msg) + bl_msg.totallen;
> - msg->data = kzalloc(msg->len, GFP_NOFS);
> - if (!msg->data)
> - goto out;
> -
> - memset(&bl_umount_request, 0, sizeof(bl_umount_request));
> - bl_umount_request.major = MAJOR(dev);
> - bl_umount_request.minor = MINOR(dev);
> -
> - memcpy(msg->data, &bl_msg, sizeof(bl_msg));
> - dataptr = (uint8_t *) msg->data;
> - memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
> -
> - add_wait_queue(&nn->bl_wq, &wq);
> - if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
> - remove_wait_queue(&nn->bl_wq, &wq);
> - goto out;
> + int i;
> +
> + *p++ = cpu_to_be32(1);
> + *p++ = cpu_to_be32(b->type);
> + *p++ = cpu_to_be32(b->simple.nr_sigs);
> + for (i = 0; i < b->simple.nr_sigs; i++) {
> + p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
> + p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
> + b->simple.sigs[i].sig_len);
> }
> -
> - set_current_state(TASK_UNINTERRUPTIBLE);
> - schedule();
> - __set_current_state(TASK_RUNNING);
> - remove_wait_queue(&nn->bl_wq, &wq);
> -
> -out:
> - kfree(msg->data);
> }
>
> -/*
> - * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
> - */
> -struct nfs4_deviceid_node *
> -bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
> +dev_t
> +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
> gfp_t gfp_mask)
> {
> - struct pnfs_block_dev *rv;
> - struct block_device *bd;
> - struct bl_pipe_msg bl_pipe_msg;
> - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> - struct bl_msg_hdr bl_msg = {
> - .type = BL_DEVICE_MOUNT,
> - .totallen = dev->mincount,
> - };
> - uint8_t *dataptr;
> - DECLARE_WAITQUEUE(wq, current);
> - int offset, len, i, rc;
> struct net *net = server->nfs_client->cl_net;
> struct nfs_net *nn = net_generic(net, nfs_net_id);
> struct bl_dev_msg *reply = &nn->bl_mount_reply;
> + struct bl_pipe_msg bl_pipe_msg;
> + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> + struct bl_msg_hdr *bl_msg;
> + DECLARE_WAITQUEUE(wq, current);
> + dev_t dev = 0;
> + int rc;
>
> dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
> - dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
> - dev->mincount);
>
> bl_pipe_msg.bl_wq = &nn->bl_wq;
> +
> + b->simple.len += 4; /* single volume */
> + if (b->simple.len > PAGE_SIZE)
> + return -EIO;
> +
> memset(msg, 0, sizeof(*msg));
> - msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
> + msg->len = sizeof(*bl_msg) + b->simple.len;
> + msg->data = kzalloc(msg->len, gfp_mask);
> if (!msg->data)
> goto out;
>
> - memcpy(msg->data, &bl_msg, sizeof(bl_msg));
> - dataptr = (uint8_t *) msg->data;
> - len = dev->mincount;
> - offset = sizeof(bl_msg);
> - for (i = 0; len > 0; i++) {
> - memcpy(&dataptr[offset], page_address(dev->pages[i]),
> - len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
> - len -= PAGE_CACHE_SIZE;
> - offset += PAGE_CACHE_SIZE;
> - }
> - msg->len = sizeof(bl_msg) + dev->mincount;
> + bl_msg = msg->data;
> + bl_msg->type = BL_DEVICE_MOUNT;
> + bl_msg->totallen = b->simple.len;
> + nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
>
> dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
> add_wait_queue(&nn->bl_wq, &wq);
> @@ -142,46 +101,10 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
> goto out;
> }
>
> - bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
> - FMODE_READ, NULL);
> - if (IS_ERR(bd)) {
> - printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
> - __func__, reply->major, reply->minor,
> - PTR_ERR(bd));
> - goto out;
> - }
> -
> - rv = kzalloc(sizeof(*rv), gfp_mask);
> - if (!rv)
> - goto out;
> -
> - nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
> - rv->d_bdev = bd;
> -
> - dprintk("%s Created device %s with bd_block_size %u\n",
> - __func__,
> - bd->bd_disk->disk_name,
> - bd->bd_block_size);
> -
> - kfree(msg->data);
> - return &rv->d_node;
> -
> + dev = MKDEV(reply->major, reply->minor);
> out:
> kfree(msg->data);
> - return NULL;
> -}
> -
> -void
> -bl_free_deviceid_node(struct nfs4_deviceid_node *d)
> -{
> - struct pnfs_block_dev *dev =
> - container_of(d, struct pnfs_block_dev, d_node);
> - struct net *net = d->nfs_client->cl_net;
> -
> - blkdev_put(dev->d_bdev, FMODE_READ);
> - bl_dm_remove(net, dev->d_bdev->bd_dev);
> -
> - kfree(dev);
> + return dev;
> }
>
> static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,


2014-09-03 04:36:47

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 6/6] pnfs/blocklayout: ask for NOTIFY_DEVICEID callbacks

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e92591c..30d6120 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -875,6 +875,8 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.owner = THIS_MODULE,
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_READ_WHOLE_PAGE,
+ .notify_types = NOTIFY_DEVICEID4_CHANGE |
+ NOTIFY_DEVICEID4_DELETE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
--
1.9.1


2014-09-04 01:38:24

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 5/6] pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing

> > +static void
> > +bl_free_device(struct pnfs_block_dev *dev)
> > +{
> > + if (dev->nr_children) {
> > + int i;
> > +
> > + for (i = 0; i < dev->nr_children; i++)
> > + bl_free_device(&dev->children[i]);
> > + kfree(dev->children);
> > + } else {
> > + if (dev->bdev)
> > + blkdev_put(dev->bdev, FMODE_READ);
>
> else if (dev->bdev)? :)

This was very much intentional to make it clear there's a leaf device and
non-leaf device case.

> Can you make each of these cases a helper function?

I could - in fact I had it that way earlier, but it increased code size and
decreased readability so I merged it back together.

> > +static int
> > +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> > + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
>
> Why put the declaration in the middle of the file?

No good reason - this is about where we start to need it. I can move
it to the top.


2014-09-03 04:36:38

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 4/6] pnfs/blocklayout: move all rpc_pipefs related code into a single file

Create a file to house all the rpc_pipefs boilerplate code instead of
sprinkling it over a few files.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/Makefile | 4 +-
fs/nfs/blocklayout/blocklayout.c | 145 +--------------
fs/nfs/blocklayout/blocklayout.h | 19 +-
fs/nfs/blocklayout/blocklayoutdev.c | 172 -----------------
fs/nfs/blocklayout/blocklayoutdm.c | 84 ---------
fs/nfs/blocklayout/rpc_pipefs.c | 362 ++++++++++++++++++++++++++++++++++++
6 files changed, 378 insertions(+), 408 deletions(-)
delete mode 100644 fs/nfs/blocklayout/blocklayoutdev.c
delete mode 100644 fs/nfs/blocklayout/blocklayoutdm.c
create mode 100644 fs/nfs/blocklayout/rpc_pipefs.c

diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index 3fa5ec7..e177026 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,5 +2,5 @@
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
-blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \
- extent_tree.o
+
+blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 28a8102..7b3c8c9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -863,132 +863,6 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.pg_write_ops = &bl_pg_write_ops,
};

-static const struct rpc_pipe_ops bl_upcall_ops = {
- .upcall = rpc_pipe_generic_upcall,
- .downcall = bl_pipe_downcall,
- .destroy_msg = bl_pipe_destroy_msg,
-};
-
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- struct dentry *dir, *dentry;
-
- dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
- if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
- dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
- void *ptr)
-{
- struct super_block *sb = ptr;
- struct net *net = sb->s_fs_info;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
- int ret = 0;
-
- if (!try_module_get(THIS_MODULE))
- return 0;
-
- if (nn->bl_device_pipe == NULL) {
- module_put(THIS_MODULE);
- return 0;
- }
-
- switch (event) {
- case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
- break;
- case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
- break;
- default:
- ret = -ENOTSUPP;
- break;
- }
- module_put(THIS_MODULE);
- return ret;
-}
-
-static struct notifier_block nfs4blocklayout_block = {
- .notifier_call = rpc_pipefs_event,
-};
-
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
- struct dentry *dentry;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_net(struct net *net,
- struct rpc_pipe *pipe)
-{
- struct super_block *pipefs_sb;
-
- pipefs_sb = rpc_get_sb_net(net);
- if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
- rpc_put_sb_net(net);
- }
-}
-
-static int nfs4blocklayout_net_init(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
-
- init_waitqueue_head(&nn->bl_wq);
- nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
- if (IS_ERR(nn->bl_device_pipe))
- return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
-}
-
-static void nfs4blocklayout_net_exit(struct net *net)
-{
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
- rpc_destroy_pipe_data(nn->bl_device_pipe);
- nn->bl_device_pipe = NULL;
-}
-
-static struct pernet_operations nfs4blocklayout_net_ops = {
- .init = nfs4blocklayout_net_init,
- .exit = nfs4blocklayout_net_exit,
-};
-
static int __init nfs4blocklayout_init(void)
{
int ret;
@@ -998,20 +872,14 @@ static int __init nfs4blocklayout_init(void)
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out;
-
- ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
- if (ret)
- goto out_remove;
- ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
+ ret = bl_init_pipefs();
if (ret)
- goto out_notifier;
-out:
- return ret;
+ goto out_unregister;
+ return 0;

-out_notifier:
- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
-out_remove:
+out_unregister:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out:
return ret;
}

@@ -1020,8 +888,7 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);

- rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
- unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+ bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&blocklayout_type);
}

diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 00c11eb..c98d98a 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -110,17 +110,6 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */

-/* blocklayoutdev.c */
-ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-
-struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
- struct pnfs_device *pdev, gfp_t gfp_mask);
-void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
-
-/* blocklayoutdm.c */
-void bl_dm_remove(struct net *net, dev_t dev);
-
/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
@@ -133,4 +122,12 @@ bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);

+/* rpc_pipefs.c */
+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
+int __init bl_init_pipefs(void);
+void __exit bl_cleanup_pipefs(void);
+
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 2b54e29..0000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdev.c
- *
- * Device operations for the pnfs nfs4 file layout driver.
- *
- * Copyright (c) 2006 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <[email protected]>
- * Fred Isaman <[email protected]>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-#include <linux/module.h>
-#include <linux/buffer_head.h> /* __bread */
-
-#include <linux/genhd.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
- size_t mlen)
-{
- struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
- nfs_net_id);
-
- if (mlen != sizeof (struct bl_dev_msg))
- return -EINVAL;
-
- if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
- return -EFAULT;
-
- wake_up(&nn->bl_wq);
-
- return mlen;
-}
-
-void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
-{
- struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
-
- if (msg->errno >= 0)
- return;
- wake_up(bl_pipe_msg->bl_wq);
-}
-
-/*
- * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
- */
-struct nfs4_deviceid_node *
-bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
- gfp_t gfp_mask)
-{
- struct pnfs_block_dev *rv;
- struct block_device *bd;
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_MOUNT,
- .totallen = dev->mincount,
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- int offset, len, i, rc;
- struct net *net = server->nfs_client->cl_net;
- struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct bl_dev_msg *reply = &nn->bl_mount_reply;
-
- dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
- dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
- dev->mincount);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
- if (!msg->data)
- goto out;
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- len = dev->mincount;
- offset = sizeof(bl_msg);
- for (i = 0; len > 0; i++) {
- memcpy(&dataptr[offset], page_address(dev->pages[i]),
- len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- offset += PAGE_CACHE_SIZE;
- }
- msg->len = sizeof(bl_msg) + dev->mincount;
-
- dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
- add_wait_queue(&nn->bl_wq, &wq);
- rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
- if (rc < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
- if (reply->status != BL_DEVICE_REQUEST_PROC) {
- printk(KERN_WARNING "%s failed to decode device: %d\n",
- __func__, reply->status);
- goto out;
- }
-
- bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
- FMODE_READ, NULL);
- if (IS_ERR(bd)) {
- printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
- __func__, reply->major, reply->minor,
- PTR_ERR(bd));
- goto out;
- }
-
- rv = kzalloc(sizeof(*rv), gfp_mask);
- if (!rv)
- goto out;
-
- nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
- rv->d_bdev = bd;
-
- dprintk("%s Created device %s with bd_block_size %u\n",
- __func__,
- bd->bd_disk->disk_name,
- bd->bd_block_size);
-
- kfree(msg->data);
- return &rv->d_node;
-
-out:
- kfree(msg->data);
- return NULL;
-}
-
-void
-bl_free_deviceid_node(struct nfs4_deviceid_node *d)
-{
- struct pnfs_block_dev *dev =
- container_of(d, struct pnfs_block_dev, d_node);
- struct net *net = d->nfs_client->cl_net;
-
- blkdev_put(dev->d_bdev, FMODE_READ);
- bl_dm_remove(net, dev->d_bdev->bd_dev);
-
- kfree(dev);
-}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index abc2e9e..0000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * linux/fs/nfs/blocklayout/blocklayoutdm.c
- *
- * Module for the NFSv4.1 pNFS block layout driver.
- *
- * Copyright (c) 2007 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Fred Isaman <[email protected]>
- * Andy Adamson <[email protected]>
- *
- * permission is granted to use, copy, create derivative works and
- * redistribute this software and such derivative works for any purpose,
- * so long as the name of the university of michigan is not used in
- * any advertising or publicity pertaining to the use or distribution
- * of this software without specific, written prior authorization. if
- * the above copyright notice or any other identification of the
- * university of michigan is included in any copy of any portion of
- * this software, then the disclaimer below must also be included.
- *
- * this software is provided as is, without representation from the
- * university of michigan as to its fitness for any purpose, and without
- * warranty by the university of michigan of any kind, either express
- * or implied, including without limitation the implied warranties of
- * merchantability and fitness for a particular purpose. the regents
- * of the university of michigan shall not be liable for any damages,
- * including special, indirect, incidental, or consequential damages,
- * with respect to any claim arising out or in connection with the use
- * of the software, even if it has been or is hereafter advised of the
- * possibility of such damages.
- */
-
-#include <linux/genhd.h> /* gendisk - used in a dprintk*/
-#include <linux/sched.h>
-#include <linux/hash.h>
-
-#include "blocklayout.h"
-
-#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-
-void bl_dm_remove(struct net *net, dev_t dev)
-{
- struct bl_pipe_msg bl_pipe_msg;
- struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
- struct bl_dev_msg bl_umount_request;
- struct bl_msg_hdr bl_msg = {
- .type = BL_DEVICE_UMOUNT,
- .totallen = sizeof(bl_umount_request),
- };
- uint8_t *dataptr;
- DECLARE_WAITQUEUE(wq, current);
- struct nfs_net *nn = net_generic(net, nfs_net_id);
-
- dprintk("Entering %s\n", __func__);
-
- bl_pipe_msg.bl_wq = &nn->bl_wq;
- memset(msg, 0, sizeof(*msg));
- msg->len = sizeof(bl_msg) + bl_msg.totallen;
- msg->data = kzalloc(msg->len, GFP_NOFS);
- if (!msg->data)
- goto out;
-
- memset(&bl_umount_request, 0, sizeof(bl_umount_request));
- bl_umount_request.major = MAJOR(dev);
- bl_umount_request.minor = MINOR(dev);
-
- memcpy(msg->data, &bl_msg, sizeof(bl_msg));
- dataptr = (uint8_t *) msg->data;
- memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-
- add_wait_queue(&nn->bl_wq, &wq);
- if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
- remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
- }
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nn->bl_wq, &wq);
-
-out:
- kfree(msg->data);
-}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 0000000..bfb0486
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2006,2007 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <[email protected]>
+ * Fred Isaman <[email protected]>
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/*
+ * Queue a BL_DEVICE_UMOUNT upcall carrying the major/minor of @dev on the
+ * per-net blocklayout pipe and sleep uninterruptibly until woken through
+ * nn->bl_wq.  Allocation and upcall failures return without any report.
+ */
+static void bl_dm_remove(struct net *net, dev_t dev)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg umount_req;
+	struct bl_msg_hdr hdr = {
+		.type = BL_DEVICE_UMOUNT,
+		.totallen = sizeof(umount_req),
+	};
+	struct bl_pipe_msg pipe_msg;
+	struct rpc_pipe_msg *msg = &pipe_msg.msg;
+	DECLARE_WAITQUEUE(wq, current);
+	uint8_t *payload;
+
+	dprintk("Entering %s\n", __func__);
+	pipe_msg.bl_wq = &nn->bl_wq;
+	memset(msg, 0, sizeof(*msg));
+	msg->len = sizeof(hdr) + hdr.totallen;
+	msg->data = kzalloc(msg->len, GFP_NOFS);
+	if (!msg->data)
+		goto out;
+
+	/* Wire format: bl_msg_hdr immediately followed by the bl_dev_msg. */
+	payload = (uint8_t *)msg->data + sizeof(hdr);
+	memset(&umount_req, 0, sizeof(umount_req));
+	umount_req.major = MAJOR(dev);
+	umount_req.minor = MINOR(dev);
+	memcpy(msg->data, &hdr, sizeof(hdr));
+	memcpy(payload, &umount_req, sizeof(umount_req));
+
+	add_wait_queue(&nn->bl_wq, &wq);
+	if (rpc_queue_upcall(nn->bl_device_pipe, msg) >= 0) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	remove_wait_queue(&nn->bl_wq, &wq);
+out:
+	kfree(msg->data);
+}
+
+/*
+ * Resolve the pnfs_block_deviceaddr4 XDR encoded in dev->pages to a block
+ * device: the encoded bytes are sent to userspace as a BL_DEVICE_MOUNT upcall
+ * and the major/minor found in nn->bl_mount_reply afterwards is opened.
+ * Returns the new deviceid node, or NULL on any failure.
+ */
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+		gfp_t gfp_mask)
+{
+	struct pnfs_block_dev *rv;
+	struct block_device *bd;
+	struct bl_pipe_msg bl_pipe_msg;
+	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
+	struct bl_msg_hdr bl_msg = {
+		.type = BL_DEVICE_MOUNT,
+		.totallen = dev->mincount,
+	};
+	uint8_t *dataptr;
+	DECLARE_WAITQUEUE(wq, current);
+	int offset, len, i;
+	struct net *net = server->nfs_client->cl_net;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct bl_dev_msg *reply = &nn->bl_mount_reply;
+
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+
+	bl_pipe_msg.bl_wq = &nn->bl_wq;
+	memset(msg, 0, sizeof(*msg));
+	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+	if (!msg->data)
+		goto out;
+
+	/* Header first, then the encoded address copied page by page. */
+	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
+	dataptr = (uint8_t *) msg->data;
+	len = dev->mincount;
+	offset = sizeof(bl_msg);
+	for (i = 0; len > 0; i++) {
+		memcpy(&dataptr[offset], page_address(dev->pages[i]),
+				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		offset += PAGE_CACHE_SIZE;
+	}
+	msg->len = sizeof(bl_msg) + dev->mincount;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	add_wait_queue(&nn->bl_wq, &wq);
+	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
+		remove_wait_queue(&nn->bl_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&nn->bl_wq, &wq);
+
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		printk(KERN_WARNING "%s failed to decode device: %d\n",
+			__func__, reply->status);
+		goto out;
+	}
+
+	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
+			FMODE_READ, NULL);
+	if (IS_ERR(bd)) {
+		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
+			__func__, reply->major, reply->minor, PTR_ERR(bd));
+		goto out;
+	}
+
+	rv = kzalloc(sizeof(*rv), gfp_mask);
+	if (!rv) {
+		/* Don't leak the reference blkdev_get_by_dev() just took. */
+		blkdev_put(bd, FMODE_READ);
+		goto out;
+	}
+
+	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
+	rv->d_bdev = bd;
+	dprintk("%s Created device %s with bd_block_size %u\n", __func__,
+		bd->bd_disk->disk_name, bd->bd_block_size);
+	kfree(msg->data);
+	return &rv->d_node;
+out:
+	kfree(msg->data);
+	return NULL;
+}
+
+/* Put the bdev held by node @d, send the umount upcall for it, free the node. */
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+	struct pnfs_block_dev *dev =
+		container_of(d, struct pnfs_block_dev, d_node);
+	/* Sample the dev_t now: d_bdev must not be dereferenced after the put. */
+	dev_t devt = dev->d_bdev->bd_dev;
+
+	blkdev_put(dev->d_bdev, FMODE_READ);
+	bl_dm_remove(d->nfs_client->cl_net, devt);
+	kfree(dev);
+}
+
+/* Pipe write from userspace: save the bl_dev_msg reply, wake nn->bl_wq. */
+static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
+		size_t mlen)
+{
+	struct net *net = filp->f_dentry->d_sb->s_fs_info;
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	/* Only a message of exactly one struct bl_dev_msg is accepted. */
+	if (mlen != sizeof(struct bl_dev_msg))
+		return -EINVAL;
+	if (copy_from_user(&nn->bl_mount_reply, src, mlen))
+		return -EFAULT;
+
+	wake_up(&nn->bl_wq);
+	return mlen;
+}
+
+/* On a failed upcall (msg->errno < 0), wake the waiter recorded in bl_wq. */
+static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct bl_pipe_msg *owner =
+		container_of(msg, struct bl_pipe_msg, msg);
+
+	if (msg->errno < 0)
+		wake_up(owner->bl_wq);
+}
+
+static const struct rpc_pipe_ops bl_upcall_ops = { /* ops for the "blocklayout" pipe */
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = bl_pipe_downcall, /* stores the reply written by userspace */
+ .destroy_msg = bl_pipe_destroy_msg, /* wakes the waiter if the upcall failed */
+};
+
+/* Create the "blocklayout" pipe under NFS_PIPE_DIRNAME of pipefs @sb. */
+static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+		struct rpc_pipe *pipe)
+{
+	struct dentry *pipe_d, *dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
+
+	if (!dir)
+		return ERR_PTR(-ENOENT);
+	pipe_d = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+	dput(dir);
+	return pipe_d;
+}
+
+static void nfs4blocklayout_unregister_sb(struct super_block *sb,
+ struct rpc_pipe *pipe) /* @sb is not used by this function */
+{
+ if (pipe->dentry) /* nothing to unlink when the pipe has no pipefs dentry */
+ rpc_unlink(pipe->dentry);
+}
+
+/*
+ * rpc_pipefs notifier: create the blocklayout pipe dentry on RPC_PIPEFS_MOUNT
+ * and unlink it on RPC_PIPEFS_UMOUNT.  Holds a module reference while running;
+ * returns 0 without acting if that fails or the net has no blocklayout pipe.
+ */
+static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
+		void *ptr)
+{
+	struct super_block *sb = ptr;
+	struct nfs_net *nn = net_generic(sb->s_fs_info, nfs_net_id);
+	struct rpc_pipe *pipe;
+	struct dentry *dentry;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return 0;
+
+	pipe = nn->bl_device_pipe;
+	if (!pipe)
+		goto out_put;
+
+	if (event == RPC_PIPEFS_MOUNT) {
+		dentry = nfs4blocklayout_register_sb(sb, pipe);
+		if (IS_ERR(dentry))
+			ret = PTR_ERR(dentry);
+		else
+			pipe->dentry = dentry;
+	} else if (event == RPC_PIPEFS_UMOUNT) {
+		if (pipe->dentry)
+			nfs4blocklayout_unregister_sb(sb, pipe);
+	} else {
+		ret = -ENOTSUPP;
+	}
+out_put:
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static struct notifier_block nfs4blocklayout_block = { /* registered by bl_init_pipefs() */
+ .notifier_call = rpc_pipefs_event,
+};
+
+/* Register @pipe in @net's pipefs; NULL if rpc_get_sb_net() finds none. */
+static struct dentry *nfs4blocklayout_register_net(struct net *net,
+		struct rpc_pipe *pipe)
+{
+	struct super_block *sb = rpc_get_sb_net(net);
+	struct dentry *pipe_d = NULL;
+
+	if (sb) {
+		pipe_d = nfs4blocklayout_register_sb(sb, pipe);
+		rpc_put_sb_net(net);
+	}
+	return pipe_d;
+}
+
+/* Remove @pipe from @net's pipefs, if rpc_get_sb_net() finds a superblock. */
+static void nfs4blocklayout_unregister_net(struct net *net,
+		struct rpc_pipe *pipe)
+{
+	struct super_block *sb = rpc_get_sb_net(net);
+
+	if (!sb)
+		return;
+	nfs4blocklayout_unregister_sb(sb, pipe);
+	rpc_put_sb_net(net);
+}
+
+/* Per-net setup: create the blocklayout pipe and register it in pipefs. */
+static int nfs4blocklayout_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct dentry *dentry;
+
+	init_waitqueue_head(&nn->bl_wq);
+	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
+	if (IS_ERR(nn->bl_device_pipe))
+		return PTR_ERR(nn->bl_device_pipe);
+	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+	if (IS_ERR(dentry))
+		rpc_destroy_pipe_data(nn->bl_device_pipe);
+	else
+		nn->bl_device_pipe->dentry = dentry;
+	return IS_ERR(dentry) ? PTR_ERR(dentry) : 0;
+}
+
+static void nfs4blocklayout_net_exit(struct net *net) /* pernet .exit hook */
+{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+ nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); /* drop the pipefs entry first */
+ rpc_destroy_pipe_data(nn->bl_device_pipe);
+ nn->bl_device_pipe = NULL; /* rpc_pipefs_event() ignores a net without a pipe */
+}
+
+static struct pernet_operations nfs4blocklayout_net_ops = { /* per-net pipe setup/teardown */
+ .init = nfs4blocklayout_net_init,
+ .exit = nfs4blocklayout_net_exit,
+};
+
+/*
+ * Init-time setup: register the rpc_pipefs notifier, then the pernet
+ * operations.  If the latter fails the notifier is unregistered again.
+ * Returns 0 on success or the error from the failing registration.
+ */
+int __init bl_init_pipefs(void)
+{
+	int err = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
+
+	if (err)
+		return err;
+
+	err = register_pernet_subsys(&nfs4blocklayout_net_ops);
+	if (err)
+		rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+	return err;
+}
+
+void __exit bl_cleanup_pipefs(void) /* undoes both registrations made by bl_init_pipefs() */
+{
+ rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
+ unregister_pernet_subsys(&nfs4blocklayout_net_ops);
+}
--
1.9.1