2014-08-11 20:04:42

by Christoph Hellwig

[permalink] [raw]
Subject: pnfs: factor GETDEVICEINFO implementations

This series adds a common implementation of the GETDEVICELIST operation,
and switches the block layout driver to use the generic device id cache.

This is mostly done by moving well tested code from the file and block
layout drivers to common code. I've only tested it using the block
layout driver, so testing with the other layouts would be greatly
appreciated. I'm mostly concerned about the object layout driver as
it had significant differences and missing bug fixes compared to the
others.

Note that the last patch requires my previous block layout series, but
the others should work fine without it.

This work was sponsored by NetApp, Inc.



2014-08-11 20:04:45

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 3/4] pnfs: add a nfs4_get_deviceid helper

This will be used by the block layout driver when splitting extents.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/pnfs.h | 7 +++++++
1 file changed, 7 insertions(+)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index ba468e2..c27fe18 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -277,6 +277,13 @@ void nfs4_deviceid_purge_client(const struct nfs_client *);
int nfs4_deviceid_getdevicelist(struct nfs_server *server,
const struct nfs_fh *fh);

+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+ atomic_inc(&d->ref);
+ return d;
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
--
1.9.1


2014-08-11 20:04:47

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 4/4] pnfs/blocklayout: use the device id cache

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/blocklayout/blocklayout.c | 149 ++----------------------------------
fs/nfs/blocklayout/blocklayout.h | 25 ++----
fs/nfs/blocklayout/blocklayoutdev.c | 88 +++++++++------------
fs/nfs/blocklayout/blocklayoutdm.c | 26 +------
fs/nfs/blocklayout/extent_tree.c | 27 ++++---
5 files changed, 65 insertions(+), 250 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index fae8144..996578a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -119,6 +119,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
void (*end_io)(struct bio *, int err),
struct parallel_io *par)
{
+ struct pnfs_block_dev *dev =
+ container_of(be->be_device, struct pnfs_block_dev, d_node);
struct bio *bio;

npg = min(npg, BIO_MAX_PAGES);
@@ -131,7 +133,7 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
if (bio) {
bio->bi_iter.bi_sector = isect - be->be_f_offset +
be->be_v_offset;
- bio->bi_bdev = be->be_mdev;
+ bio->bi_bdev = dev->d_bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
@@ -514,96 +516,9 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
ext_tree_mark_committed(BLK_LO2EXT(lo), lcdata->res.status);
}

-static void free_blk_mountid(struct block_mount_id *mid)
-{
- if (mid) {
- struct pnfs_block_dev *dev, *tmp;
-
- /* No need to take bm_lock as we are last user freeing bm_devlist */
- list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
- list_del(&dev->bm_node);
- bl_free_block_dev(dev);
- }
- kfree(mid);
- }
-}
-
-/* This is mostly copied from the filelayout_get_device_info function.
- * It seems much of this should be at the generic pnfs level.
- */
-static struct pnfs_block_dev *
-nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
- struct nfs4_deviceid *d_id)
-{
- struct pnfs_device *dev;
- struct pnfs_block_dev *rv;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- int i, rc;
-
- /*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
- */
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s max_resp_sz %u max_pages %d\n",
- __func__, max_resp_sz, max_pages);
-
- dev = kmalloc(sizeof(*dev), GFP_NOFS);
- if (!dev) {
- dprintk("%s kmalloc failed\n", __func__);
- return ERR_PTR(-ENOMEM);
- }
-
- pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
- if (pages == NULL) {
- kfree(dev);
- return ERR_PTR(-ENOMEM);
- }
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
- if (!pages[i]) {
- rv = ERR_PTR(-ENOMEM);
- goto out_free;
- }
- }
-
- memcpy(&dev->dev_id, d_id, sizeof(*d_id));
- dev->layout_type = LAYOUT_BLOCK_VOLUME;
- dev->pages = pages;
- dev->pgbase = 0;
- dev->pglen = PAGE_SIZE * max_pages;
- dev->mincount = 0;
- dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
- rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc) {
- rv = ERR_PTR(rc);
- goto out_free;
- }
-
- rv = nfs4_blk_decode_device(server, dev);
- out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(dev);
- return rv;
-}
-
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
- struct block_mount_id *b_mt_id = NULL;
- struct pnfs_devicelist *dlist = NULL;
- struct pnfs_block_dev *bdev;
- LIST_HEAD(block_disklist);
- int status, i;
-
dprintk("%s enter\n", __func__);

if (server->pnfs_blksize == 0) {
@@ -616,60 +531,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
return -EINVAL;
}

- b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
- if (!b_mt_id) {
- status = -ENOMEM;
- goto out_error;
- }
- /* Initialize nfs4 block layout mount id */
- spin_lock_init(&b_mt_id->bm_lock);
- INIT_LIST_HEAD(&b_mt_id->bm_devlist);
-
- dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
- if (!dlist) {
- status = -ENOMEM;
- goto out_error;
- }
- dlist->eof = 0;
- while (!dlist->eof) {
- status = nfs4_proc_getdevicelist(server, fh, dlist);
- if (status)
- goto out_error;
- dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
- __func__, dlist->num_devs, dlist->eof);
- for (i = 0; i < dlist->num_devs; i++) {
- bdev = nfs4_blk_get_deviceinfo(server, fh,
- &dlist->dev_id[i]);
- if (IS_ERR(bdev)) {
- status = PTR_ERR(bdev);
- goto out_error;
- }
- spin_lock(&b_mt_id->bm_lock);
- list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
- spin_unlock(&b_mt_id->bm_lock);
- }
- }
- dprintk("%s SUCCESS\n", __func__);
- server->pnfs_ld_data = b_mt_id;
-
- out_return:
- kfree(dlist);
- return status;
-
- out_error:
- free_blk_mountid(b_mt_id);
- goto out_return;
-}
-
-static int
-bl_clear_layoutdriver(struct nfs_server *server)
-{
- struct block_mount_id *b_mt_id = server->pnfs_ld_data;
-
- dprintk("%s enter\n", __func__);
- free_blk_mountid(b_mt_id);
- dprintk("%s RETURNS\n", __func__);
- return 0;
+ return nfs4_deviceid_getdevicelist(server, fh);
}

static bool
@@ -810,7 +672,8 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
- .clear_layoutdriver = bl_clear_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
};
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index caae202..bdcc588 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,16 +44,9 @@
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)

-struct block_mount_id {
- spinlock_t bm_lock; /* protects list */
- struct list_head bm_devlist; /* holds pnfs_block_dev */
-};
-
struct pnfs_block_dev {
- struct list_head bm_node;
- struct nfs4_deviceid bm_mdevid; /* associated devid */
- struct block_device *bm_mdev; /* meta device itself */
- struct net *net;
+ struct nfs4_deviceid_node d_node;
+ struct block_device *d_bdev;
};

enum exstate4 {
@@ -69,8 +62,7 @@ struct pnfs_block_extent {
struct rb_node be_node;
struct list_head be_list;
};
- struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */
- struct block_device *be_mdev;
+ struct nfs4_deviceid_node *be_device;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
@@ -90,8 +82,6 @@ struct pnfs_block_layout {
spinlock_t bl_ext_lock; /* Protects list manipulation */
};

-#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
-
static inline struct pnfs_block_layout *
BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
@@ -123,14 +113,15 @@ struct bl_msg_hdr {
/* blocklayoutdev.c */
ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-void nfs4_blkdev_put(struct block_device *bdev);
-struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev);
int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);

+struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_mask);
+void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
+
/* blocklayoutdm.c */
-void bl_free_block_dev(struct pnfs_block_dev *bdev);
+void bl_dm_remove(struct net *net, dev_t dev);

/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index cd71b5e..d6527d2 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -53,16 +53,6 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
return 0;
}

-/*
- * Release the block device
- */
-void nfs4_blkdev_put(struct block_device *bdev)
-{
- dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
- MINOR(bdev->bd_dev));
- blkdev_put(bdev, FMODE_READ);
-}
-
ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
size_t mlen)
{
@@ -92,12 +82,12 @@ void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
/*
* Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
*/
-struct pnfs_block_dev *
-nfs4_blk_decode_device(struct nfs_server *server,
- struct pnfs_device *dev)
+struct nfs4_deviceid_node *
+bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
+ gfp_t gfp_mask)
{
struct pnfs_block_dev *rv;
- struct block_device *bd = NULL;
+ struct block_device *bd;
struct bl_pipe_msg bl_pipe_msg;
struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
struct bl_msg_hdr bl_msg = {
@@ -117,11 +107,9 @@ nfs4_blk_decode_device(struct nfs_server *server,

bl_pipe_msg.bl_wq = &nn->bl_wq;
memset(msg, 0, sizeof(*msg));
- msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
- if (!msg->data) {
- rv = ERR_PTR(-ENOMEM);
+ msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
+ if (!msg->data)
goto out;
- }

memcpy(msg->data, &bl_msg, sizeof(bl_msg));
dataptr = (uint8_t *) msg->data;
@@ -140,7 +128,6 @@ nfs4_blk_decode_device(struct nfs_server *server,
rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
if (rc < 0) {
remove_wait_queue(&nn->bl_wq, &wq);
- rv = ERR_PTR(rc);
goto out;
}

@@ -152,7 +139,6 @@ nfs4_blk_decode_device(struct nfs_server *server,
if (reply->status != BL_DEVICE_REQUEST_PROC) {
printk(KERN_WARNING "%s failed to decode device: %d\n",
__func__, reply->status);
- rv = ERR_PTR(-EINVAL);
goto out;
}

@@ -162,51 +148,40 @@ nfs4_blk_decode_device(struct nfs_server *server,
printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
__func__, reply->major, reply->minor,
PTR_ERR(bd));
- rv = ERR_CAST(bd);
goto out;
}

- rv = kzalloc(sizeof(*rv), GFP_NOFS);
- if (!rv) {
- rv = ERR_PTR(-ENOMEM);
+ rv = kzalloc(sizeof(*rv), gfp_mask);
+ if (!rv)
goto out;
- }

- rv->bm_mdev = bd;
- memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
- rv->net = net;
+ nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
+ rv->d_bdev = bd;
+
dprintk("%s Created device %s with bd_block_size %u\n",
__func__,
bd->bd_disk->disk_name,
bd->bd_block_size);

+ kfree(msg->data);
+ return &rv->d_node;
+
out:
kfree(msg->data);
- return rv;
+ return NULL;
}

-/* Map deviceid returned by the server to constructed block_device */
-static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
- struct nfs4_deviceid *id)
+void
+bl_free_deviceid_node(struct nfs4_deviceid_node *d)
{
- struct block_device *rv = NULL;
- struct block_mount_id *mid;
- struct pnfs_block_dev *dev;
-
- dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
- mid = BLK_ID(lo);
- spin_lock(&mid->bm_lock);
- list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
- if (memcmp(id->data, dev->bm_mdevid.data,
- NFS4_DEVICEID4_SIZE) == 0) {
- rv = dev->bm_mdev;
- goto out;
- }
- }
- out:
- spin_unlock(&mid->bm_lock);
- dprintk("%s returning %p\n", __func__, rv);
- return rv;
+ struct pnfs_block_dev *dev =
+ container_of(d, struct pnfs_block_dev, d_node);
+ struct net *net = d->nfs_client->cl_net;
+
+ blkdev_put(dev->d_bdev, FMODE_READ);
+ bl_dm_remove(net, dev->d_bdev->bd_dev);
+
+ kfree(dev);
}

/* Tracks info needed to ensure extents in layout obey constraints of spec */
@@ -309,15 +284,20 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
* recovery easier.
*/
for (i = 0; i < count; i++) {
+ struct nfs4_deviceid id;
+
be = kzalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
if (!be) {
status = -ENOMEM;
goto out_err;
}
- memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
+ memcpy(&id, p, NFS4_DEVICEID4_SIZE);
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
- be->be_mdev = translate_devid(lo, &be->be_devid);
- if (!be->be_mdev)
+
+ be->be_device =
+ nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
+ lo->plh_lc_cred, gfp_flags);
+ if (!be->be_device)
goto out_err;

/* The next three values are read in as bytes,
@@ -364,12 +344,14 @@ nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
return status;

out_err:
+ nfs4_put_deviceid_node(be->be_device);
kfree(be);
out_free_list:
while (!list_empty(&extents)) {
be = list_first_entry(&extents, struct pnfs_block_extent,
be_list);
list_del(&be->be_list);
+ nfs4_put_deviceid_node(be->be_device);
kfree(be);
}
goto out;
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index 8999cfd..abc2e9e 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -38,7 +38,7 @@

#define NFSDBG_FACILITY NFSDBG_PNFS_LD

-static void dev_remove(struct net *net, dev_t dev)
+void bl_dm_remove(struct net *net, dev_t dev)
{
struct bl_pipe_msg bl_pipe_msg;
struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
@@ -82,27 +82,3 @@ static void dev_remove(struct net *net, dev_t dev)
out:
kfree(msg->data);
}
-
-/*
- * Release meta device
- */
-static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
-{
- dprintk("%s Releasing\n", __func__);
- nfs4_blkdev_put(bdev->bm_mdev);
- dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
-}
-
-void bl_free_block_dev(struct pnfs_block_dev *bdev)
-{
- if (bdev) {
- if (bdev->bm_mdev) {
- dprintk("%s Removing DM device: %d:%d\n",
- __func__,
- MAJOR(bdev->bm_mdev->bd_dev),
- MINOR(bdev->bm_mdev->bd_dev));
- nfs4_blk_metadev_release(bdev);
- }
- kfree(bdev);
- }
-}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 3945221..32398dd 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -69,7 +69,7 @@ ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
if (be1->be_state != be2->be_state)
return false;
- if (be1->be_mdev != be2->be_mdev)
+ if (be1->be_device != be2->be_device)
return false;

if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
@@ -94,6 +94,7 @@ ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
if (left && ext_can_merge(left, be)) {
left->be_length += be->be_length;
rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
kfree(be);
return left;
}
@@ -109,6 +110,7 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
if (right && ext_can_merge(be, right)) {
be->be_length += right->be_length;
rb_erase(&right->be_node, root);
+ nfs4_put_deviceid_node(right->be_device);
kfree(right);
}

@@ -133,16 +135,14 @@ __ext_tree_insert(struct rb_root *root,
be->be_v_offset = new->be_v_offset;
be->be_length += new->be_length;
be = ext_try_to_merge_left(root, be);
- kfree(new);
- return;
+ goto free_new;
}
p = &(*p)->rb_left;
} else if (new->be_f_offset >= ext_f_end(be)) {
if (merge_ok && ext_can_merge(be, new)) {
be->be_length += new->be_length;
be = ext_try_to_merge_right(root, be);
- kfree(new);
- return;
+ goto free_new;
}
p = &(*p)->rb_right;
} else {
@@ -152,6 +152,10 @@ __ext_tree_insert(struct rb_root *root,

rb_link_node(&new->be_node, parent, p);
rb_insert_color(&new->be_node, root);
+ return;
+free_new:
+ nfs4_put_deviceid_node(new->be_device);
+ kfree(new);
}

static int
@@ -196,9 +200,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
new->be_length = len2;
new->be_state = be->be_state;
new->be_tag = be->be_tag;
- new->be_mdev = be->be_mdev;
- memcpy(&new->be_devid, &be->be_devid,
- sizeof(struct nfs4_deviceid));
+ new->be_device = nfs4_get_deviceid(be->be_device);

__ext_tree_insert(root, new, true);
} else {
@@ -219,6 +221,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
struct pnfs_block_extent *next = ext_tree_next(be);

rb_erase(&be->be_node, root);
+ nfs4_put_deviceid_node(be->be_device);
kfree(be);
be = next;
}
@@ -263,6 +266,7 @@ retry:
__ext_tree_insert(root, new, true);
} else if (new->be_f_offset >= be->be_f_offset) {
if (ext_f_end(new) <= ext_f_end(be)) {
+ nfs4_put_deviceid_node(new->be_device);
kfree(new);
} else {
sector_t new_len = ext_f_end(new) - ext_f_end(be);
@@ -288,6 +292,7 @@ retry:
}

split->be_length = be->be_f_offset - split->be_f_offset;
+ split->be_device = nfs4_get_deviceid(new->be_device);
__ext_tree_insert(root, split, true);

new->be_f_offset += diff;
@@ -378,9 +383,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
new->be_length = orig_len - be->be_length;
new->be_state = be->be_state;
new->be_tag = be->be_tag;
-
- new->be_mdev = be->be_mdev;
- memcpy(&new->be_devid, &be->be_devid, sizeof(struct nfs4_deviceid));
+ new->be_device = nfs4_get_deviceid(be->be_device);

dprintk("%s: got 0x%lx:0x%lx!\n",
__func__, be->be_f_offset, ext_f_end(be));
@@ -484,7 +487,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
}

- p = xdr_encode_opaque_fixed(p, be->be_devid.data,
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
NFS4_DEVICEID4_SIZE);
p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
--
1.9.1


2014-08-21 14:46:54

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

Boaz,

did you get a chance to test this against and object layout server?

Did anyone manage to test it against a file layout server?


2014-08-12 15:53:25

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

Ok, I've pushed a commit to implement the max_getdeviceinfo_size
field out.


2014-08-12 12:21:59

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

[Can you please trim your quotes? Quoting 700+ lines of a patch for
a 30 line reply is completely unreasonable, and placing the reply in
the middle of it is even worse. I will ignore mails ignoring the
netiquette this blatantly in the future]

On Tue, Aug 12, 2014 at 02:36:55PM +0300, Boaz Harrosh wrote:
> > + /*
> > + * Use the session max response size as the basis for setting
> > + * GETDEVICEINFO's maxcount
> > + */
> > + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
> > + max_pages = nfs_page_array_len(0, max_resp_sz);
> > + dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
> > + __func__, server, max_resp_sz, max_pages);
> > +
>
> This is an extremely too big an allocation for obj-lo (which has
> a couple of embedded strings here). The all RPC can fit a single
> page
>
> Should we put like a flag in struct pnfs_layoutdriver_type:
>
> if (server->pnfs_curr_ld->flags & PNFS_DEVINFO_SINGLE_PAGE) {
> max_pages = 1;
> max_resp_sz = PAGE_SIZE;
> }
>
> This gives us so many extra allocation for storing one page pointer but for
> the simplicity of the cleanup we can live with it.

Sounds fine to me, but do you really have that many GETDEVICEINFO calls
in object layout setups that it's worth the effort?

Another slightly cleaner option would be to have a max_deviceinfo_size
field in the layout driver and cap the size by it.


2014-08-21 16:39:44

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

On 08/21/2014 05:46 PM, Christoph Hellwig wrote:
> Boaz,
>
> did you get a chance to test this against and object layout server?
>
> Did anyone manage to test it against a file layout server?
>
>

So sorry was distracted these two week. Will test tomorrow I promise

Thanks
Boaz


2014-08-12 09:52:44

by Christoph Hellwig

[permalink] [raw]
Subject: Re: pnfs: factor GETDEVICEINFO implementations

On Tue, Aug 12, 2014 at 12:35:25PM +0300, Boaz Harrosh wrote:
> > This is mostly done by moving well tested code from the file and block
> > layout drivers to common code. I've only tested it using the block
> > layout driver, so testing with the other layouts would be greatly
> > appreciated. I'm mostly concerned about the object layout driver as
> > it had significant differences and missing bug fixes compared to the
> > others.
> >
> Hi Christoph
>
> Please say what BUGs have you seen in object-layout-drive?

I've not seen actual bugs, but it's been missing various minor bug
fixes that were applied to the file layout version.

> Do you have this on a public tree somewhere? I would like to test it?

I didn't bother for just those few patches, but I've created a "getdeviceinfo"
branch at git://git.infradead.org/users/hch/pnfs.git


2014-08-11 20:04:44

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 2/4] pnfs add a common GETDEVICELIST implementation

At a simple helper to issue a GETDEVICELIST operation and pre-load
the device id cache based on the result.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/pnfs.h | 2 ++
fs/nfs/pnfs_dev.c | 28 ++++++++++++++++++++++++++++
2 files changed, 30 insertions(+)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e145b79..ba468e2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -274,6 +274,8 @@ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
+int nfs4_deviceid_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh);

static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index e3d4fa9..a30e6b2 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -356,3 +356,31 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
rcu_read_unlock();
}

+int
+nfs4_deviceid_getdevicelist(struct nfs_server *server,
+ const struct nfs_fh *fh)
+{
+ struct pnfs_devicelist *dlist;
+ struct nfs4_deviceid_node *d;
+ int error = 0, i;
+
+ dlist = kzalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
+ if (!dlist)
+ return -ENOMEM;
+
+ while (!dlist->eof) {
+ error = nfs4_proc_getdevicelist(server, fh, dlist);
+ if (error)
+ break;
+
+ for (i = 0; i < dlist->num_devs; i++) {
+ d = nfs4_find_get_deviceid(server, &dlist->dev_id[i],
+ NULL, GFP_NOFS);
+ if (d)
+ nfs4_put_deviceid_node(d);
+ }
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(nfs4_deviceid_getdevicelist);
--
1.9.1


2014-08-12 10:57:11

by Boaz Harrosh

[permalink] [raw]
Subject: Re: pnfs: factor GETDEVICEINFO implementations

On 08/12/2014 12:52 PM, Christoph Hellwig wrote:
> On Tue, Aug 12, 2014 at 12:35:25PM +0300, Boaz Harrosh wrote:
>>> This is mostly done by moving well tested code from the file and block
>>> layout drivers to common code. I've only tested it using the block
>>> layout driver, so testing with the other layouts would be greatly
>>> appreciated. I'm mostly concerned about the object layout driver as
>>> it had significant differences and missing bug fixes compared to the
>>> others.
>>>
>> Hi Christoph
>>
>> Please say what BUGs have you seen in object-layout-drive?
>
> I've not seen actual bugs, but it's been missing various minor bug
> fixes that were applied to the file layout version.
>
>> Do you have this on a public tree somewhere? I would like to test it?
>
> I didn't bother for just those few patches, but I've created a "getdeviceinfo"
> branch at git://git.infradead.org/users/hch/pnfs.git
>
>

Thanks Christoph, I'll give it a go and report. (But not today)

Boaz


2014-08-12 12:33:55

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

On 08/12/2014 03:21 PM, Christoph Hellwig wrote:
> [Can you please trim your quotes? Quoting 700+ lines of a patch for
> a 30 line reply is completely unreasonable, and placing the reply in
> the middle of it is even worse. I will ignore mails ignoring the
> netiquette this blatantly in the future]
>

Yes sorry you are absolutely right, My bad, I usually do this, a moment
of spaciness.

> On Tue, Aug 12, 2014 at 02:36:55PM +0300, Boaz Harrosh wrote:
>>> + /*
>>> + * Use the session max response size as the basis for setting
>>> + * GETDEVICEINFO's maxcount
>>> + */
>>> + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
>>> + max_pages = nfs_page_array_len(0, max_resp_sz);
>>> + dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
>>> + __func__, server, max_resp_sz, max_pages);
>>> +
>>
>> This is an extremely too big an allocation for obj-lo (which has
>> a couple of embedded strings here). The all RPC can fit a single
>> page
>>
>> Should we put like a flag in struct pnfs_layoutdriver_type:
>>
>> if (server->pnfs_curr_ld->flags & PNFS_DEVINFO_SINGLE_PAGE) {
>> max_pages = 1;
>> max_resp_sz = PAGE_SIZE;
>> }
>>
>> This gives us so many extra allocation for storing one page pointer but for
>> the simplicity of the cleanup we can live with it.
>
> Sounds fine to me, but do you really have that many GETDEVICEINFO calls
> in object layout setups that it's worth the effort?
>

Panasas's biggest installation is like 1200 OSDs. With exofs I tested
with 300. They come in groups of like 9, each 9 devices is good for
2G of data, before you move to the next set. Each file has a randomized
set of devices, so a git clone would load the 300 devices easily.

> Another slightly cleaner option would be to have a max_deviceinfo_size
> field in the layout driver and cap the size by it.
>

Sure! max_deviceinfo_size would be even better, lets say that if it is
zero then max_resp_sz is used. (Easier for you)

Thanks
Boaz


2014-08-11 20:04:44

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

Add support to the common pNFS core to issue GETDEVICEINFO calls on
a device ID cache miss. The code is taken from the well debugged
file layout implementation and calls out to the layoutdriver through
a new alloc_deviceid_node method. The calling conventions for
nfs4_find_get_deviceid are changed so that all information needed to
send a GETDEVICEINFO request is passed to the common code.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nfs/filelayout/filelayout.c | 29 +++++---
fs/nfs/filelayout/filelayout.h | 7 +-
fs/nfs/filelayout/filelayoutdev.c | 108 ++--------------------------
fs/nfs/objlayout/objio_osd.c | 114 +++++++++++------------------
fs/nfs/objlayout/objlayout.c | 70 ------------------
fs/nfs/objlayout/objlayout.h | 5 --
fs/nfs/pnfs.h | 12 ++--
fs/nfs/pnfs_dev.c | 146 ++++++++++++++++++++++++++------------
8 files changed, 178 insertions(+), 313 deletions(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1359c4a..3136fc7 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -646,18 +646,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
}

/* find and reference the deviceid */
- d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
- NFS_SERVER(lo->plh_inode)->nfs_client, id);
- if (d == NULL) {
- dsaddr = filelayout_get_device_info(lo->plh_inode, id,
- lo->plh_lc_cred, gfp_flags);
- if (dsaddr == NULL)
- goto out;
- } else
- dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
+ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
+ lo->plh_lc_cred, gfp_flags);
+ if (d == NULL)
+ goto out;
+
+ dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
/* Found deviceid is unavailable */
if (filelayout_test_devid_unavailable(&dsaddr->id_node))
- goto out_put;
+ goto out_put;

fl->dsaddr = dsaddr;

@@ -1367,6 +1364,17 @@ out:
cinfo->ds->ncommitting = 0;
return PNFS_ATTEMPTED;
}
+static struct nfs4_deviceid_node *
+filelayout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_file_layout_dsaddr *dsaddr;
+
+ dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}

static void
filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1419,6 +1427,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
+ .alloc_deviceid_node = filelayout_alloc_deviceid_node,
.free_deviceid_node = filelayout_free_deveiceid_node,
};

diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2..7c9f800 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
u32 ds_idx);
+
+extern struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags);
extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred, gfp_t gfp_flags);

#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 48f8dcd..c1edb3a 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
}

/* Decode opaque device data and return the result */
-static struct nfs4_file_layout_dsaddr*
-decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
int i;
u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
dsaddr->stripe_indices = stripe_indices;
stripe_indices = NULL;
dsaddr->ds_num = num;
- nfs4_init_deviceid_node(&dsaddr->id_node,
- NFS_SERVER(ino)->pnfs_curr_ld,
- NFS_SERVER(ino)->nfs_client,
- &pdev->dev_id);
+ nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);

INIT_LIST_HEAD(&dsaddrs);

@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)

mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
+ da = decode_ds_addr(server->nfs_client->cl_net,
&stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
return NULL;
}

-/*
- * Decode the opaque device specified in 'dev' and add it to the cache of
- * available devices.
- */
-static struct nfs4_file_layout_dsaddr *
-decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct nfs4_file_layout_dsaddr *n, *new;
-
- new = decode_device(inode, dev, gfp_flags);
- if (!new) {
- printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
- __func__);
- return NULL;
- }
-
- d = nfs4_insert_deviceid_node(&new->id_node);
- n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
- if (n != new) {
- nfs4_fl_free_deviceid(new);
- return n;
- }
-
- return new;
-}
-
-/*
- * Retrieve the information for dev_id, add it to the list
- * of available devices, and return it.
- */
-struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode,
- struct nfs4_deviceid *dev_id,
- struct rpc_cred *cred,
- gfp_t gfp_flags)
-{
- struct pnfs_device *pdev = NULL;
- u32 max_resp_sz;
- int max_pages;
- struct page **pages = NULL;
- struct nfs4_file_layout_dsaddr *dsaddr = NULL;
- int rc, i;
- struct nfs_server *server = NFS_SERVER(inode);
-
- /*
- * Use the session max response size as the basis for setting
- * GETDEVICEINFO's maxcount
- */
- max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- max_pages = nfs_page_array_len(0, max_resp_sz);
- dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
- __func__, inode, max_resp_sz, max_pages);
-
- pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
- if (pdev == NULL)
- return NULL;
-
- pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
- if (pages == NULL) {
- kfree(pdev);
- return NULL;
- }
- for (i = 0; i < max_pages; i++) {
- pages[i] = alloc_page(gfp_flags);
- if (!pages[i])
- goto out_free;
- }
-
- memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
- pdev->layout_type = LAYOUT_NFSV4_1_FILES;
- pdev->pages = pages;
- pdev->pgbase = 0;
- pdev->pglen = max_resp_sz;
- pdev->mincount = 0;
- pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
-
- rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
- dprintk("%s getdevice info returns %d\n", __func__, rc);
- if (rc)
- goto out_free;
-
- /*
- * Found new device, need to decode it and then add it to the
- * list of known devices for this mountpoint.
- */
- dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
-out_free:
- for (i = 0; i < max_pages; i++)
- __free_page(pages[i]);
- kfree(pages);
- kfree(pdev);
- dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
- return dsaddr;
-}
-
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278..da2c1c4 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
kfree(de);
}

-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de;
-
- d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
- if (!d)
- return NULL;
-
- de = container_of(d, struct objio_dev_ent, id_node);
- return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
- const struct nfs4_deviceid *d_id, struct osd_dev *od,
- gfp_t gfp_flags)
-{
- struct nfs4_deviceid_node *d;
- struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
- struct objio_dev_ent *n;
-
- if (!de) {
- dprintk("%s: -ENOMEM od=%p\n", __func__, od);
- return NULL;
- }
-
- dprintk("%s: Adding od=%p\n", __func__, od);
- nfs4_init_deviceid_node(&de->id_node,
- nfss->pnfs_curr_ld,
- nfss->nfs_client,
- d_id);
- de->od.od = od;
-
- d = nfs4_insert_deviceid_node(&de->id_node);
- n = container_of(d, struct objio_dev_ent, id_node);
- if (n != de) {
- dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
- objio_free_deviceid_node(&de->id_node);
- de = n;
- }
-
- return de;
-}
-
struct objio_segment {
struct pnfs_layout_segment lseg;

@@ -130,29 +84,24 @@ struct objio_state {

/* Send and wait for a get_device_info of devices in the layout,
then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
- struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
- gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
{
struct pnfs_osd_deviceaddr *deviceaddr;
- struct objio_dev_ent *ode;
+ struct objio_dev_ent *ode = NULL;
struct osd_dev *od;
struct osd_dev_info odi;
bool retry_flag = true;
+ u32 *p;
int err;

- ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
- if (ode) {
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
- return 0;
- }
-
- err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
- if (unlikely(err)) {
- dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
- __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
- return err;
- }
+ deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+ if (!deviceaddr)
+ return NULL;
+
+ p = page_address(pdev->pages[0]);
+ pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);

odi.systemid_len = deviceaddr->oda_systemid.len;
if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
goto out;
}

- ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
- gfp_flags);
- objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
dprintk("Adding new dev_id(%llx:%llx)\n",
- _DEVID_LO(d_id), _DEVID_HI(d_id));
+ _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+ ode = kzalloc(sizeof(*ode), gfp_flags);
+ if (!ode) {
+ dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+ goto out;
+ }
+
+ nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+ kfree(deviceaddr);
+
+ ode->od.od = od;
+ return &ode->id_node;
+
out:
- objlayout_put_deviceinfo(deviceaddr);
- return err;
+ kfree(deviceaddr);
+ return NULL;
}

static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
struct xdr_stream *xdr,
gfp_t gfp_flags)
{
+ struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
struct objio_segment *objio_seg;
struct pnfs_osd_xdr_decode_layout_iter iter;
struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
objio_seg->oc.first_dev = layout.olo_comps_index;
cur_comp = 0;
while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+ struct nfs4_deviceid_node *d;
+ struct objio_dev_ent *ode;
+
copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
- err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
- &src_comp.oc_object_id.oid_device_id,
- gfp_flags);
- if (err)
+
+ d = nfs4_find_get_deviceid(server,
+ &src_comp.oc_object_id.oid_device_id,
+ pnfslay->plh_lc_cred, gfp_flags);
+ if (!d) {
+ err = -ENXIO;
goto err;
- ++cur_comp;
+ }
+
+ ode = container_of(d, struct objio_dev_ent, id_node);
+ objio_seg->oc.ods[cur_comp++] = &ode->od;
}
/* pnfs_osd_xdr_decode_layout_comp returns false on error */
if (unlikely(err))
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d..c89357c 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
dprintk("%s: Return\n", __func__);
}

-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
- struct page *page;
- struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags)
-{
- struct objlayout_deviceinfo *odi;
- struct pnfs_device pd;
- struct page *page, **pages;
- u32 *p;
- int err;
-
- page = alloc_page(gfp_flags);
- if (!page)
- return -ENOMEM;
-
- pages = &page;
- pd.pages = pages;
-
- memcpy(&pd.dev_id, d_id, sizeof(*d_id));
- pd.layout_type = LAYOUT_OSD2_OBJECTS;
- pd.pages = &page;
- pd.pgbase = 0;
- pd.pglen = PAGE_SIZE;
- pd.mincount = 0;
- pd.maxcount = PAGE_SIZE;
-
- err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
- pnfslay->plh_lc_cred);
- dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
- if (err)
- goto err_out;
-
- p = page_address(page);
- odi = kzalloc(sizeof(*odi), gfp_flags);
- if (!odi) {
- err = -ENOMEM;
- goto err_out;
- }
- pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
- odi->page = page;
- *deviceaddr = &odi->da;
- return 0;
-
-err_out:
- __free_page(page);
- return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
- struct objlayout_deviceinfo *odi = container_of(deviceaddr,
- struct objlayout_deviceinfo,
- da);
-
- __free_page(odi->page);
- kfree(odi);
-}
-
enum {
OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d..3a0828d 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
extern void objlayout_write_done(struct objlayout_io_res *oir,
ssize_t status, bool sync);

-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
- struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
- gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
/*
* exported generic objects function vectors
*/
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 603f460..e145b79 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -125,6 +125,9 @@ struct pnfs_layoutdriver_type {
enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);

void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+ struct nfs4_deviceid_node * (*alloc_deviceid_node)
+ (struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);

void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
struct xdr_stream *xdr,
@@ -259,11 +262,12 @@ struct nfs4_deviceid_node {
atomic_t ref;
};

-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask);
void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
- const struct pnfs_layoutdriver_type *,
- const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
const struct nfs4_deviceid *);
struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209b..e3d4fa9 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
*/

#include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
#include "pnfs.h"

#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,71 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
return NULL;
}

+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+ const struct nfs4_deviceid *dev_id,
+ struct rpc_cred *cred, gfp_t gfp_flags)
+{
+ struct nfs4_deviceid_node *d = NULL;
+ struct pnfs_device *pdev = NULL;
+ struct page **pages = NULL;
+ u32 max_resp_sz;
+ int max_pages;
+ int rc, i;
+
+ /*
+ * Use the session max response size as the basis for setting
+ * GETDEVICEINFO's maxcount
+ */
+ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+ max_pages = nfs_page_array_len(0, max_resp_sz);
+ dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+ __func__, server, max_resp_sz, max_pages);
+
+ pdev = kzalloc(sizeof(*pdev), gfp_flags);
+ if (!pdev)
+ return NULL;
+
+ pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+ if (!pages)
+ goto out_free_pdev;
+
+ for (i = 0; i < max_pages; i++) {
+ pages[i] = alloc_page(gfp_flags);
+ if (!pages[i])
+ goto out_free_pages;
+ }
+
+ memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+ pdev->layout_type = server->pnfs_curr_ld->id;
+ pdev->pages = pages;
+ pdev->pgbase = 0;
+ pdev->pglen = max_resp_sz;
+ pdev->mincount = 0;
+ pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+ rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+ dprintk("%s getdevice info returns %d\n", __func__, rc);
+ if (rc)
+ goto out_free_pages;
+
+ /*
+ * Found new device, need to decode it and then add it to the
+ * list of known devices for this mountpoint.
+ */
+ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+ gfp_flags);
+
+out_free_pages:
+ for (i = 0; i < max_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+out_free_pdev:
+ kfree(pdev);
+ dprintk("<-- %s d %p\n", __func__, d);
+ return d;
+}
+
/*
* Lookup a deviceid in cache and get a reference count on it if found
*
@@ -96,14 +164,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
* @id deviceid to look up
*/
static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id,
- long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, long hash)
{
struct nfs4_deviceid_node *d;

rcu_read_lock();
- d = _lookup_deviceid(ld, clp, id, hash);
+ d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+ hash);
if (d != NULL)
atomic_inc(&d->ref);
rcu_read_unlock();
@@ -111,10 +179,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
}

struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+ const struct nfs4_deviceid *id, struct rpc_cred *cred,
+ gfp_t gfp_mask)
{
- return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+ long hash = nfs4_deviceid_hash(id);
+ struct nfs4_deviceid_node *d, *new;
+
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d)
+ return d;
+
+ new = nfs4_get_device_info(server, id, cred, gfp_mask);
+ if (!new)
+ return new;
+
+ spin_lock(&nfs4_deviceid_lock);
+ d = __nfs4_find_get_deviceid(server, id, hash);
+ if (d) {
+ spin_unlock(&nfs4_deviceid_lock);
+ server->pnfs_curr_ld->free_deviceid_node(new);
+ return d;
+ }
+ hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+ atomic_inc(&new->ref);
+ spin_unlock(&nfs4_deviceid_lock);
+
+ return new;
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);

@@ -151,15 +242,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);

void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
- const struct pnfs_layoutdriver_type *ld,
- const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
const struct nfs4_deviceid *id)
{
INIT_HLIST_NODE(&d->node);
INIT_HLIST_NODE(&d->tmpnode);
- d->ld = ld;
- d->nfs_client = nfs_client;
+ d->ld = server->pnfs_curr_ld;
+ d->nfs_client = server->nfs_client;
d->flags = 0;
d->deviceid = *id;
atomic_set(&d->ref, 1);
@@ -167,39 +256,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);

/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- * Note that the caller must set up the following members:
- * new->ld
- * new->nfs_client
- * new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
- struct nfs4_deviceid_node *d;
- long hash;
-
- spin_lock(&nfs4_deviceid_lock);
- hash = nfs4_deviceid_hash(&new->deviceid);
- d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
- if (d) {
- spin_unlock(&nfs4_deviceid_lock);
- return d;
- }
-
- hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
- spin_unlock(&nfs4_deviceid_lock);
- atomic_inc(&new->ref);
-
- return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
-/*
* Dereference a deviceid node and delete it when its reference count drops
* to zero.
*
--
1.9.1


2014-08-12 09:35:29

by Boaz Harrosh

[permalink] [raw]
Subject: Re: pnfs: factor GETDEVICEINFO implementations

On 08/11/2014 11:06 PM, Christoph Hellwig wrote:
> This series adds a common implementation of the GETDEVICELIST operation,
> and switches the block layout driver to use the generic device id cache.
>
> This is mostly done by moving well tested code from the file and block
> layout drivers to common code. I've only tested it using the block
> layout driver, so testing with the other layouts would be greatly
> appreciated. I'm mostly concerned about the object layout driver as
> it had significant differences and missing bug fixes compared to the
> others.
>
Hi Christoph

Please say what BUGs have you seen in object-layout-drive?

It should be noted that pnfs-obj's GETDEVICEINFO is very simple and will
never ever reach a full page. So the code was more simple, that's all.

Do you have this on a public tree somewhere? I would like to test it?

Thanks
Boaz

> Note that the last patch requires my previous block layout series, but
> the others should work fine without it.
>
> This work was sponsored by NetApp, Inc.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> .
>


2014-08-12 11:36:59

by Boaz Harrosh

[permalink] [raw]
Subject: Re: [PATCH 1/4] pnfs: factor GETDEVICEINFO implementations

On 08/11/2014 11:06 PM, Christoph Hellwig wrote:
> Add support to the common pNFS core to issue GETDEVICEINFO calls on
> a device ID cache miss. The code is taken from the well debugged
> file layout implementation and calls out to the layoutdriver through
> a new alloc_deviceid_node method. The calling conventions for
> nfs4_find_get_deviceid are changed so that all information needed to
> send a GETDEVICEINFO request is passed to the common code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/nfs/filelayout/filelayout.c | 29 +++++---
> fs/nfs/filelayout/filelayout.h | 7 +-
> fs/nfs/filelayout/filelayoutdev.c | 108 ++--------------------------
> fs/nfs/objlayout/objio_osd.c | 114 +++++++++++------------------
> fs/nfs/objlayout/objlayout.c | 70 ------------------
> fs/nfs/objlayout/objlayout.h | 5 --
> fs/nfs/pnfs.h | 12 ++--
> fs/nfs/pnfs_dev.c | 146 ++++++++++++++++++++++++++------------
> 8 files changed, 178 insertions(+), 313 deletions(-)
>
> diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
> index 1359c4a..3136fc7 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -646,18 +646,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
> }
>
> /* find and reference the deviceid */
> - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
> - NFS_SERVER(lo->plh_inode)->nfs_client, id);
> - if (d == NULL) {
> - dsaddr = filelayout_get_device_info(lo->plh_inode, id,
> - lo->plh_lc_cred, gfp_flags);
> - if (dsaddr == NULL)
> - goto out;
> - } else
> - dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
> + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
> + lo->plh_lc_cred, gfp_flags);
> + if (d == NULL)
> + goto out;
> +
> + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
> /* Found deviceid is unavailable */
> if (filelayout_test_devid_unavailable(&dsaddr->id_node))
> - goto out_put;
> + goto out_put;
>
> fl->dsaddr = dsaddr;
>
> @@ -1367,6 +1364,17 @@ out:
> cinfo->ds->ncommitting = 0;
> return PNFS_ATTEMPTED;
> }
> +static struct nfs4_deviceid_node *
> +filelayout_alloc_deviceid_node(struct nfs_server *server,
> + struct pnfs_device *pdev, gfp_t gfp_flags)
> +{
> + struct nfs4_file_layout_dsaddr *dsaddr;
> +
> + dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
> + if (!dsaddr)
> + return NULL;
> + return &dsaddr->id_node;
> +}
>
> static void
> filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
> @@ -1419,6 +1427,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
> .commit_pagelist = filelayout_commit_pagelist,
> .read_pagelist = filelayout_read_pagelist,
> .write_pagelist = filelayout_write_pagelist,
> + .alloc_deviceid_node = filelayout_alloc_deviceid_node,
> .free_deviceid_node = filelayout_free_deveiceid_node,
> };
>
> diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
> index ffbddf2..7c9f800 100644
> --- a/fs/nfs/filelayout/filelayout.h
> +++ b/fs/nfs/filelayout/filelayout.h
> @@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
> u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
> struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
> u32 ds_idx);
> +
> +extern struct nfs4_file_layout_dsaddr *
> +nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
> + struct pnfs_device *pdev, gfp_t gfp_flags);
> extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
> extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
> -struct nfs4_file_layout_dsaddr *
> -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
> - struct rpc_cred *cred, gfp_t gfp_flags);
>
> #endif /* FS_NFS_NFS4FILELAYOUT_H */
> diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
> index 48f8dcd..c1edb3a 100644
> --- a/fs/nfs/filelayout/filelayoutdev.c
> +++ b/fs/nfs/filelayout/filelayoutdev.c
> @@ -484,8 +484,9 @@ out_err:
> }
>
> /* Decode opaque device data and return the result */
> -static struct nfs4_file_layout_dsaddr*
> -decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
> +struct nfs4_file_layout_dsaddr *
> +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
> + gfp_t gfp_flags)
> {
> int i;
> u32 cnt, num;
> @@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
> dsaddr->stripe_indices = stripe_indices;
> stripe_indices = NULL;
> dsaddr->ds_num = num;
> - nfs4_init_deviceid_node(&dsaddr->id_node,
> - NFS_SERVER(ino)->pnfs_curr_ld,
> - NFS_SERVER(ino)->nfs_client,
> - &pdev->dev_id);
> + nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
>
> INIT_LIST_HEAD(&dsaddrs);
>
> @@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
>
> mp_count = be32_to_cpup(p); /* multipath count */
> for (j = 0; j < mp_count; j++) {
> - da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
> + da = decode_ds_addr(server->nfs_client->cl_net,
> &stream, gfp_flags);
> if (da)
> list_add_tail(&da->da_node, &dsaddrs);
> @@ -637,102 +635,6 @@ out_err:
> return NULL;
> }
>
> -/*
> - * Decode the opaque device specified in 'dev' and add it to the cache of
> - * available devices.
> - */
> -static struct nfs4_file_layout_dsaddr *
> -decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
> -{
> - struct nfs4_deviceid_node *d;
> - struct nfs4_file_layout_dsaddr *n, *new;
> -
> - new = decode_device(inode, dev, gfp_flags);
> - if (!new) {
> - printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
> - __func__);
> - return NULL;
> - }
> -
> - d = nfs4_insert_deviceid_node(&new->id_node);
> - n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
> - if (n != new) {
> - nfs4_fl_free_deviceid(new);
> - return n;
> - }
> -
> - return new;
> -}
> -
> -/*
> - * Retrieve the information for dev_id, add it to the list
> - * of available devices, and return it.
> - */
> -struct nfs4_file_layout_dsaddr *
> -filelayout_get_device_info(struct inode *inode,
> - struct nfs4_deviceid *dev_id,
> - struct rpc_cred *cred,
> - gfp_t gfp_flags)
> -{
> - struct pnfs_device *pdev = NULL;
> - u32 max_resp_sz;
> - int max_pages;
> - struct page **pages = NULL;
> - struct nfs4_file_layout_dsaddr *dsaddr = NULL;
> - int rc, i;
> - struct nfs_server *server = NFS_SERVER(inode);
> -
> - /*
> - * Use the session max response size as the basis for setting
> - * GETDEVICEINFO's maxcount
> - */
> - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
> - max_pages = nfs_page_array_len(0, max_resp_sz);
> - dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
> - __func__, inode, max_resp_sz, max_pages);
> -
> - pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
> - if (pdev == NULL)
> - return NULL;
> -
> - pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
> - if (pages == NULL) {
> - kfree(pdev);
> - return NULL;
> - }
> - for (i = 0; i < max_pages; i++) {
> - pages[i] = alloc_page(gfp_flags);
> - if (!pages[i])
> - goto out_free;
> - }
> -
> - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
> - pdev->layout_type = LAYOUT_NFSV4_1_FILES;
> - pdev->pages = pages;
> - pdev->pgbase = 0;
> - pdev->pglen = max_resp_sz;
> - pdev->mincount = 0;
> - pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
> -
> - rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
> - dprintk("%s getdevice info returns %d\n", __func__, rc);
> - if (rc)
> - goto out_free;
> -
> - /*
> - * Found new device, need to decode it and then add it to the
> - * list of known devices for this mountpoint.
> - */
> - dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
> -out_free:
> - for (i = 0; i < max_pages; i++)
> - __free_page(pages[i]);
> - kfree(pages);
> - kfree(pdev);
> - dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
> - return dsaddr;
> -}
> -
> void
> nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
> {
> diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
> index ae05278..da2c1c4 100644
> --- a/fs/nfs/objlayout/objio_osd.c
> +++ b/fs/nfs/objlayout/objio_osd.c
> @@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
> kfree(de);
> }
>
> -static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
> - const struct nfs4_deviceid *d_id)
> -{
> - struct nfs4_deviceid_node *d;
> - struct objio_dev_ent *de;
> -
> - d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
> - if (!d)
> - return NULL;
> -
> - de = container_of(d, struct objio_dev_ent, id_node);
> - return de;
> -}
> -
> -static struct objio_dev_ent *
> -_dev_list_add(const struct nfs_server *nfss,
> - const struct nfs4_deviceid *d_id, struct osd_dev *od,
> - gfp_t gfp_flags)
> -{
> - struct nfs4_deviceid_node *d;
> - struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
> - struct objio_dev_ent *n;
> -
> - if (!de) {
> - dprintk("%s: -ENOMEM od=%p\n", __func__, od);
> - return NULL;
> - }
> -
> - dprintk("%s: Adding od=%p\n", __func__, od);
> - nfs4_init_deviceid_node(&de->id_node,
> - nfss->pnfs_curr_ld,
> - nfss->nfs_client,
> - d_id);
> - de->od.od = od;
> -
> - d = nfs4_insert_deviceid_node(&de->id_node);
> - n = container_of(d, struct objio_dev_ent, id_node);
> - if (n != de) {
> - dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
> - objio_free_deviceid_node(&de->id_node);
> - de = n;
> - }
> -
> - return de;
> -}
> -
> struct objio_segment {
> struct pnfs_layout_segment lseg;
>
> @@ -130,29 +84,24 @@ struct objio_state {
>
> /* Send and wait for a get_device_info of devices in the layout,
> then look them up with the osd_initiator library */
> -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
> - struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
> - gfp_t gfp_flags)
> +struct nfs4_deviceid_node *
> +objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
> + gfp_t gfp_flags)
> {
> struct pnfs_osd_deviceaddr *deviceaddr;
> - struct objio_dev_ent *ode;
> + struct objio_dev_ent *ode = NULL;
> struct osd_dev *od;
> struct osd_dev_info odi;
> bool retry_flag = true;
> + u32 *p;
> int err;
>
> - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
> - if (ode) {
> - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
> - return 0;
> - }
> -
> - err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
> - if (unlikely(err)) {
> - dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
> - __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
> - return err;
> - }
> + deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
> + if (!deviceaddr)
> + return NULL;
> +
> + p = page_address(pdev->pages[0]);
> + pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
>
> odi.systemid_len = deviceaddr->oda_systemid.len;
> if (odi.systemid_len > sizeof(odi.systemid)) {
> @@ -188,14 +137,24 @@ retry_lookup:
> goto out;
> }
>
> - ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
> - gfp_flags);
> - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
> dprintk("Adding new dev_id(%llx:%llx)\n",
> - _DEVID_LO(d_id), _DEVID_HI(d_id));
> + _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
> +
> + ode = kzalloc(sizeof(*ode), gfp_flags);
> + if (!ode) {
> + dprintk("%s: -ENOMEM od=%p\n", __func__, od);
> + goto out;
> + }
> +
> + nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
> + kfree(deviceaddr);
> +
> + ode->od.od = od;
> + return &ode->id_node;
> +
> out:
> - objlayout_put_deviceinfo(deviceaddr);
> - return err;
> + kfree(deviceaddr);
> + return NULL;
> }
>
> static void copy_single_comp(struct ore_components *oc, unsigned c,
> @@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
> struct xdr_stream *xdr,
> gfp_t gfp_flags)
> {
> + struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
> struct objio_segment *objio_seg;
> struct pnfs_osd_xdr_decode_layout_iter iter;
> struct pnfs_osd_layout layout;
> @@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
> objio_seg->oc.first_dev = layout.olo_comps_index;
> cur_comp = 0;
> while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
> + struct nfs4_deviceid_node *d;
> + struct objio_dev_ent *ode;
> +
> copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
> - err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
> - &src_comp.oc_object_id.oid_device_id,
> - gfp_flags);
> - if (err)
> +
> + d = nfs4_find_get_deviceid(server,
> + &src_comp.oc_object_id.oid_device_id,
> + pnfslay->plh_lc_cred, gfp_flags);
> + if (!d) {
> + err = -ENXIO;
> goto err;
> - ++cur_comp;
> + }
> +
> + ode = container_of(d, struct objio_dev_ent, id_node);
> + objio_seg->oc.ods[cur_comp++] = &ode->od;
> }
> /* pnfs_osd_xdr_decode_layout_comp returns false on error */
> if (unlikely(err))
> diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
> index 697a16d..c89357c 100644
> --- a/fs/nfs/objlayout/objlayout.c
> +++ b/fs/nfs/objlayout/objlayout.c
> @@ -574,76 +574,6 @@ loop_done:
> dprintk("%s: Return\n", __func__);
> }
>
> -
> -/*
> - * Get Device Info API for io engines
> - */
> -struct objlayout_deviceinfo {
> - struct page *page;
> - struct pnfs_osd_deviceaddr da; /* This must be last */
> -};
> -
> -/* Initialize and call nfs_getdeviceinfo, then decode and return a
> - * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
> - * should be called.
> - */
> -int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
> - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
> - gfp_t gfp_flags)
> -{
> - struct objlayout_deviceinfo *odi;
> - struct pnfs_device pd;
> - struct page *page, **pages;
> - u32 *p;
> - int err;
> -
> - page = alloc_page(gfp_flags);
> - if (!page)
> - return -ENOMEM;
> -
> - pages = &page;
> - pd.pages = pages;
> -
> - memcpy(&pd.dev_id, d_id, sizeof(*d_id));
> - pd.layout_type = LAYOUT_OSD2_OBJECTS;
> - pd.pages = &page;
> - pd.pgbase = 0;
> - pd.pglen = PAGE_SIZE;
> - pd.mincount = 0;
> - pd.maxcount = PAGE_SIZE;
> -
> - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
> - pnfslay->plh_lc_cred);
> - dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
> - if (err)
> - goto err_out;
> -
> - p = page_address(page);
> - odi = kzalloc(sizeof(*odi), gfp_flags);
> - if (!odi) {
> - err = -ENOMEM;
> - goto err_out;
> - }
> - pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
> - odi->page = page;
> - *deviceaddr = &odi->da;
> - return 0;
> -
> -err_out:
> - __free_page(page);
> - return err;
> -}
> -
> -void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
> -{
> - struct objlayout_deviceinfo *odi = container_of(deviceaddr,
> - struct objlayout_deviceinfo,
> - da);
> -
> - __free_page(odi->page);
> - kfree(odi);
> -}
> -
> enum {
> OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
> OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
> diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
> index fd13f1d..3a0828d 100644
> --- a/fs/nfs/objlayout/objlayout.h
> +++ b/fs/nfs/objlayout/objlayout.h
> @@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
> extern void objlayout_write_done(struct objlayout_io_res *oir,
> ssize_t status, bool sync);
>
> -extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
> - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
> - gfp_t gfp_flags);
> -extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
> -
> /*
> * exported generic objects function vectors
> */
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 603f460..e145b79 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -125,6 +125,9 @@ struct pnfs_layoutdriver_type {
> enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
>
> void (*free_deviceid_node) (struct nfs4_deviceid_node *);
> + struct nfs4_deviceid_node * (*alloc_deviceid_node)
> + (struct nfs_server *server, struct pnfs_device *pdev,
> + gfp_t gfp_flags);
>
> void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
> struct xdr_stream *xdr,
> @@ -259,11 +262,12 @@ struct nfs4_deviceid_node {
> atomic_t ref;
> };
>
> -struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
> +struct nfs4_deviceid_node *
> +nfs4_find_get_deviceid(struct nfs_server *server,
> + const struct nfs4_deviceid *id, struct rpc_cred *cred,
> + gfp_t gfp_mask);
> void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
> -void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
> - const struct pnfs_layoutdriver_type *,
> - const struct nfs_client *,
> +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
> const struct nfs4_deviceid *);
> struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
> bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
> diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
> index 6da209b..e3d4fa9 100644
> --- a/fs/nfs/pnfs_dev.c
> +++ b/fs/nfs/pnfs_dev.c
> @@ -29,6 +29,9 @@
> */
>
> #include <linux/export.h>
> +#include <linux/nfs_fs.h>
> +#include "nfs4session.h"
> +#include "internal.h"
> #include "pnfs.h"
>
> #define NFSDBG_FACILITY NFSDBG_PNFS
> @@ -89,6 +92,71 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
> return NULL;
> }
>
> +static struct nfs4_deviceid_node *
> +nfs4_get_device_info(struct nfs_server *server,
> + const struct nfs4_deviceid *dev_id,
> + struct rpc_cred *cred, gfp_t gfp_flags)
> +{
> + struct nfs4_deviceid_node *d = NULL;
> + struct pnfs_device *pdev = NULL;
> + struct page **pages = NULL;
> + u32 max_resp_sz;
> + int max_pages;
> + int rc, i;
> +
> + /*
> + * Use the session max response size as the basis for setting
> + * GETDEVICEINFO's maxcount
> + */
> + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
> + max_pages = nfs_page_array_len(0, max_resp_sz);
> + dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
> + __func__, server, max_resp_sz, max_pages);
> +

This is an extremely too big an allocation for obj-lo (which has
a couple of embedded strings here). The all RPC can fit a single
page

Should we put like a flag in struct pnfs_layoutdriver_type:

if (server->pnfs_curr_ld->flags & PNFS_DEVINFO_SINGLE_PAGE) {
max_pages = 1;
max_resp_sz = PAGE_SIZE;
}

This gives us so many extra allocation for storing one page pointer but for
the simplicity of the cleanup we can live with it.

What do you say? For obj-lo a device_id is a light wait resource and it can
have hundreds/thousand of them. Not like file-lo and block-lo that have few, up to
tens, devices. Do we want this optimization?

(Did not review the all thing yet, will do later)
Thanks
Boaz

> + pdev = kzalloc(sizeof(*pdev), gfp_flags);
> + if (!pdev)
> + return NULL;
> +
> + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
> + if (!pages)
> + goto out_free_pdev;
> +
> + for (i = 0; i < max_pages; i++) {
> + pages[i] = alloc_page(gfp_flags);
> + if (!pages[i])
> + goto out_free_pages;
> + }
> +
> + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
> + pdev->layout_type = server->pnfs_curr_ld->id;
> + pdev->pages = pages;
> + pdev->pgbase = 0;
> + pdev->pglen = max_resp_sz;
> + pdev->mincount = 0;
> + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
> +
> + rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
> + dprintk("%s getdevice info returns %d\n", __func__, rc);
> + if (rc)
> + goto out_free_pages;
> +
> + /*
> + * Found new device, need to decode it and then add it to the
> + * list of known devices for this mountpoint.
> + */
> + d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
> + gfp_flags);
> +
> +out_free_pages:
> + for (i = 0; i < max_pages; i++)
> + __free_page(pages[i]);
> + kfree(pages);
> +out_free_pdev:
> + kfree(pdev);
> + dprintk("<-- %s d %p\n", __func__, d);
> + return d;
> +}
> +
> /*
> * Lookup a deviceid in cache and get a reference count on it if found
> *
> @@ -96,14 +164,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
> * @id deviceid to look up
> */
> static struct nfs4_deviceid_node *
> -_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
> - const struct nfs_client *clp, const struct nfs4_deviceid *id,
> - long hash)
> +__nfs4_find_get_deviceid(struct nfs_server *server,
> + const struct nfs4_deviceid *id, long hash)
> {
> struct nfs4_deviceid_node *d;
>
> rcu_read_lock();
> - d = _lookup_deviceid(ld, clp, id, hash);
> + d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
> + hash);
> if (d != NULL)
> atomic_inc(&d->ref);
> rcu_read_unlock();
> @@ -111,10 +179,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
> }
>
> struct nfs4_deviceid_node *
> -nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
> - const struct nfs_client *clp, const struct nfs4_deviceid *id)
> +nfs4_find_get_deviceid(struct nfs_server *server,
> + const struct nfs4_deviceid *id, struct rpc_cred *cred,
> + gfp_t gfp_mask)
> {
> - return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
> + long hash = nfs4_deviceid_hash(id);
> + struct nfs4_deviceid_node *d, *new;
> +
> + d = __nfs4_find_get_deviceid(server, id, hash);
> + if (d)
> + return d;
> +
> + new = nfs4_get_device_info(server, id, cred, gfp_mask);
> + if (!new)
> + return new;
> +
> + spin_lock(&nfs4_deviceid_lock);
> + d = __nfs4_find_get_deviceid(server, id, hash);
> + if (d) {
> + spin_unlock(&nfs4_deviceid_lock);
> + server->pnfs_curr_ld->free_deviceid_node(new);
> + return d;
> + }
> + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
> + atomic_inc(&new->ref);
> + spin_unlock(&nfs4_deviceid_lock);
> +
> + return new;
> }
> EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
>
> @@ -151,15 +242,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
> EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
>
> void
> -nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
> - const struct pnfs_layoutdriver_type *ld,
> - const struct nfs_client *nfs_client,
> +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
> const struct nfs4_deviceid *id)
> {
> INIT_HLIST_NODE(&d->node);
> INIT_HLIST_NODE(&d->tmpnode);
> - d->ld = ld;
> - d->nfs_client = nfs_client;
> + d->ld = server->pnfs_curr_ld;
> + d->nfs_client = server->nfs_client;
> d->flags = 0;
> d->deviceid = *id;
> atomic_set(&d->ref, 1);
> @@ -167,39 +256,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
> EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
>
> /*
> - * Uniquely initialize and insert a deviceid node into cache
> - *
> - * @new new deviceid node
> - * Note that the caller must set up the following members:
> - * new->ld
> - * new->nfs_client
> - * new->deviceid
> - *
> - * @ret the inserted node, if none found, otherwise, the found entry.
> - */
> -struct nfs4_deviceid_node *
> -nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
> -{
> - struct nfs4_deviceid_node *d;
> - long hash;
> -
> - spin_lock(&nfs4_deviceid_lock);
> - hash = nfs4_deviceid_hash(&new->deviceid);
> - d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
> - if (d) {
> - spin_unlock(&nfs4_deviceid_lock);
> - return d;
> - }
> -
> - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
> - spin_unlock(&nfs4_deviceid_lock);
> - atomic_inc(&new->ref);
> -
> - return new;
> -}
> -EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
> -
> -/*
> * Dereference a deviceid node and delete it when its reference count drops
> * to zero.
> *
>