2010-04-26 16:18:16

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 0/3] pNFS generic device ID cache version 3


This patch set implements a shared RCU device ID cache servicing multiple
mounts of a single layout type per meta data server (struct nfs_client).

Device IDs are referenced by layout segments, which hold a pointer to the
nfs4_deviceid struct.

An hlist is used for the cache due to the large number of devices used by
the object and block layout drivers.

Note that nfs4_deviceid_hash() is the same as the NFSD opaque_hash function.
Perhaps they should be shared.

0001-SQUASHME-pnfs_submit-generic-device-ID-cache.patch
0002-SQUASHME-pnfs_submit-fix-multiple-mount-set_pnfs_lay.patch
0003-SQUASHME-pnfs-submit-file-layout-driver-generic-devi.patch

These patches apply to the 2.6.34-rc3 pnfs-submit branch.

Testing:
-------
CONFIG_NFS_V4_1 set:

NFSv4.1/pNFS mounts:
Connectathon tests pass against GFS2/pNFS with a single AUTH_SYS mount, a double
AUTH_SYS mount, and an AUTH_SYS and AUTH_GSS/KRB5 mount (which creates
two superblocks under a struct nfs_client and both share the device id cache).

NFSv4.0 mount:
Connectathon tests pass

Did not test with multiple device IDs. I will create a multiple device ID
test with the pynfs file layout server.

CONFIG_NFS_V4_1 not set:

NFSv4.0 mount: Connectathon tests pass.

-->Andy



2010-04-26 16:18:17

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache

From: Andy Adamson <[email protected]>

A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).

Device IDs of type deviceid4 are required by all layout types, are long lived,
and are read at each I/O. They are added to the deviceid cache at first
reference by a layout via GETDEVICEINFO and (currently) are only removed at
umount.

Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.

Drop the file system's reference on the device ID cache in the
uninitialize_mountpoint layoutdriver_io_operation, which is called at umount.

Each layoutsegment assigns a pointer and takes a reference to the
nfs4_deviceid structure identified by the layout deviceid.
This is so that there are no deviceid lookups for the normal I/O path.

Even though it is required by all layout types, the deviceid is not exposed in
the LAYOUTGET4res but is instead hidden in the opaque layouttype4.

Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
and free_lseg calls nfs4_unset_layout_deviceid.

While the file layout driver will not cache very many deviceids, the object
and block layout drivers could cache hundreds for a large installation, so
use an hlist for the cache.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 167 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/nfs4_pnfs.h | 50 +++++++++++++
include/linux/nfs_fs_sb.h | 1 +
3 files changed, 218 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..bf906cc 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
#include <linux/nfs4.h>
#include <linux/pnfs_xdr.h>
#include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>

#include "internal.h"
#include "nfs4_fs.h"
@@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = {

EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+ void (*free_callback)(struct kref *))
+{
+ struct nfs4_deviceid_cache *c;
+
+ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+ spin_lock(&clp->cl_lock);
+ if (clp->cl_devid_cache != NULL) {
+ kref_get(&clp->cl_devid_cache->dc_kref);
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [kref [%d]]\n", __func__,
+ atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+ kfree(c);
+ } else {
+ int i;
+
+ spin_lock_init(&c->dc_lock);
+ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
+ INIT_HLIST_HEAD(&c->dc_deviceids[i]);
+ kref_init(&c->dc_kref);
+ c->dc_free_callback = free_callback;
+ clp->cl_devid_cache = c;
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [new]\n", __func__);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+ INIT_HLIST_NODE(&d->de_node);
+ kref_init(&d->de_kref);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+/* Called from layoutdriver_io_operations->alloc_lseg */
+void
+nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
+{
+ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
+ l->deviceid = d;
+ kref_get(&d->de_kref);
+}
+EXPORT_SYMBOL(nfs4_set_layout_deviceid);
+
+/* Called from layoutdriver_io_operations->free_lseg */
+void
+nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
+ struct nfs4_deviceid *d,
+ void (*free_callback)(struct kref *))
+{
+ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
+ l->deviceid = NULL;
+ kref_put(&d->de_kref, free_callback);
+}
+EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+ struct nfs4_deviceid *d;
+ struct hlist_node *n;
+ long hash = nfs4_deviceid_hash(id);
+
+ dprintk("--> %s hash %ld\n", __func__, hash);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ rcu_read_unlock();
+ return d;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+struct nfs4_deviceid *
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+ struct nfs4_deviceid *d;
+ struct hlist_node *n;
+ long hash = nfs4_deviceid_hash(&new->de_id);
+
+ dprintk("--> %s hash %ld\n", __func__, hash);
+ spin_lock(&c->dc_lock);
+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [discard]\n", __func__);
+ c->dc_free_callback(&new->de_kref);
+ return d;
+ }
+ }
+ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [new]\n", __func__);
+ return new;
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
+{
+ struct nfs4_deviceid *d;
+ struct hlist_node *n;
+
+ dprintk("--> %s hash %ld\n", __func__, hash);
+ spin_lock(&c->dc_lock);
+ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
+ hlist_del_rcu(&d->de_node);
+ spin_unlock(&c->dc_lock);
+ synchronize_rcu();
+ dprintk("%s [%d]\n", __func__,
+ atomic_read(&d->de_kref.refcount));
+ kref_put(&d->de_kref, c->dc_free_callback);
+ return 1;
+ }
+ spin_unlock(&c->dc_lock);
+ return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+ struct nfs4_deviceid_cache *cache =
+ container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+ int more;
+ long i;
+
+ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
+ more = 1;
+ while (more)
+ more = nfs4_remove_deviceid(cache, i);
+ }
+ kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs_client *clp)
+{
+ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
+ int refcount;
+
+ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+ spin_lock(&clp->cl_lock);
+ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount);
+ if (refcount == 1)
+ clp->cl_devid_cache = NULL;
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [%d]\n", __func__, refcount);
+ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 3caac60..3b7aeb7 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -106,6 +106,7 @@ struct pnfs_layout_segment {
struct kref kref;
bool valid;
struct pnfs_layout_type *layout;
+ struct nfs4_deviceid *deviceid;
u8 ld_data[]; /* layout driver private data */
};

@@ -275,6 +276,55 @@ struct pnfs_devicelist {
struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
};

+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS 5
+#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
+
+static inline u32
+nfs4_deviceid_hash(struct pnfs_deviceid *id)
+{
+ unsigned char *cptr = (unsigned char *)id->data;
+ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
+ u32 x = 0;
+
+ while (nbytes--) {
+ x *= 37;
+ x += *cptr++;
+ }
+ return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+
+struct nfs4_deviceid_cache {
+ spinlock_t dc_lock;
+ struct kref dc_kref;
+ void (*dc_free_callback)(struct kref *);
+ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+ struct hlist_node de_node;
+ struct pnfs_deviceid de_id;
+ struct kref de_kref;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+ void (*free_callback)(struct kref *));
+extern void nfs4_put_deviceid_cache(struct nfs_client *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+ struct pnfs_deviceid *);
+extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+ struct nfs4_deviceid *);
+extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
+ struct nfs4_deviceid *);
+extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *,
+ struct nfs4_deviceid *,
+ void (*free_callback)(struct kref *));
+
/* pNFS client callback functions.
* These operations allow the layout driver to access pNFS client
* specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
u32 cl_exchange_flags;
struct nfs4_session *cl_session; /* sharred session */
struct list_head cl_lo_inodes; /* Inodes having layouts */
+ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */

#ifdef CONFIG_NFS_FSCACHE
--
1.6.6


2010-04-26 16:18:17

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver

From: Andy Adamson <[email protected]>

The same struct nfs_server can enter set_pnfs_layoutdriver for mounts that
share a super block. Don't initialize a pnfs mountpoint more than once.

Don't set the pnfs_curr_ld until the pnfs mountpoint initialization succeeds.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 15 ++++++++++-----
1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bf906cc..a3e8231 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -215,20 +215,25 @@ set_pnfs_layoutdriver(struct super_block *sb, struct nfs_fh *fh, u32 id)
struct pnfs_mount_type *mt;
struct nfs_server *server = NFS_SB(sb);

+ if (server->pnfs_curr_ld)
+ return;
+
if (id > 0 && find_pnfs(id, &mod)) {
- dprintk("%s: Setting pNFS module\n", __func__);
- server->pnfs_curr_ld = mod->pnfs_ld_type;
- mt = server->pnfs_curr_ld->ld_io_ops->initialize_mountpoint(
+ mt = mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
sb, fh);
if (!mt) {
printk(KERN_ERR "%s: Error initializing mount point "
"for layout driver %u. ", __func__, id);
goto out_err;
}
- /* Layout driver succeeded in initializing mountpoint */
+ /*
+ * Layout driver succeeded in initializing mountpoint
+ * and has taken a reference on the nfs_client cl_devid_cache
+ */
+ server->pnfs_curr_ld = mod->pnfs_ld_type;
server->pnfs_mountid = mt;
- /* Set the rpc_ops */
server->nfs_client->rpc_ops = &pnfs_v4_clientops;
+ dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
}

--
1.6.6


2010-04-26 16:18:17

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache

From: Andy Adamson <[email protected]>

Replace the per superblock deviceid cache with the generic deviceid cache.

Embed struct nfs4_deviceid into struct nfs4_file_layout_dsaddr, the file layout
specific deviceid structure. Provide a free_deviceid_callback.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 1 +
fs/nfs/nfs4filelayout.c | 54 ++++++------
fs/nfs/nfs4filelayout.h | 12 +--
fs/nfs/nfs4filelayoutdev.c | 199 +++++++++-----------------------------------
4 files changed, 71 insertions(+), 195 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e13ccb7..887d71e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,6 +38,7 @@
#include <net/ipv6.h>
#include <linux/nfs_xdr.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs4_pnfs.h>

#include <asm/system.h>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0530b59..79b9df2 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -76,17 +76,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
{
struct filelayout_mount_type *fl_mt;
struct pnfs_mount_type *mt;
- int status;

fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL);
if (!fl_mt)
goto error_ret;

- /* Initialize nfs4 file layout specific device list structure */
- fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL);
- if (!fl_mt->hlist)
- goto cleanup_fl_mt;
-
mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL);
if (!mt)
goto cleanup_fl_mt;
@@ -94,11 +88,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
fl_mt->fl_sb = sb;
mt->mountid = (void *)fl_mt;

- status = nfs4_pnfs_devlist_init(fl_mt->hlist);
- if (status)
+ if (nfs4_alloc_init_deviceid_cache(NFS_SB(sb)->nfs_client,
+ nfs4_fl_free_deviceid_callback))
goto cleanup_mt;

- dprintk("%s: device list has been initialized successfully\n",
+ dprintk("%s: deviceid cache has been initialized successfully\n",
__func__);
return mt;

@@ -106,11 +100,10 @@ cleanup_mt: ;
kfree(mt);

cleanup_fl_mt: ;
- kfree(fl_mt->hlist);
kfree(fl_mt);

error_ret: ;
- printk(KERN_WARNING "%s: device list could not be initialized\n",
+ printk(KERN_WARNING "%s: deviceid cache could not be initialized\n",
__func__);

return NULL;
@@ -123,13 +116,11 @@ filelayout_uninitialize_mountpoint(struct pnfs_mount_type *mountid)
{
struct filelayout_mount_type *fl_mt = NULL;

+ dprintk("--> %s\n", __func__);
if (mountid) {
fl_mt = (struct filelayout_mount_type *)mountid->mountid;
-
- if (fl_mt != NULL) {
- nfs4_pnfs_devlist_destroy(fl_mt->hlist);
- kfree(fl_mt);
- }
+ nfs4_put_deviceid_cache(NFS_SB(fl_mt->fl_sb)->nfs_client);
+ kfree(fl_mt);
kfree(mountid);
}
return 0;
@@ -381,8 +372,7 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo));

dprintk("--> %s\n", __func__);
- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(PNFS_INODE(lo))->hlist,
- &fl->dev_id);
+ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id);
if (dsaddr == NULL) {
dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id);
if (dsaddr == NULL) {
@@ -421,13 +411,17 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
dprintk("%s Stripe unit (%u) not aligned with rsize %u wsize %u\n",
__func__, fl->stripe_unit, nfss->ds_rsize, nfss->ds_wsize);
}
+
+ /* reference the device */
+ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid);
+
status = 0;
out:
dprintk("--> %s returns %d\n", __func__, status);
return status;
}

-static void filelayout_free_lseg(struct pnfs_layout_segment *lseg);
+static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg);
static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl);

/* Decode layout and store in layoutid. Overwrite any existing layout
@@ -512,6 +506,7 @@ filelayout_alloc_lseg(struct pnfs_layout_type *layoutid,
struct pnfs_layout_segment *lseg;
int rc;

+ dprintk("--> %s\n", __func__);
lseg = kzalloc(sizeof(struct pnfs_layout_segment) +
sizeof(struct nfs4_filelayout_segment), GFP_KERNEL);
if (!lseg)
@@ -520,7 +515,7 @@ filelayout_alloc_lseg(struct pnfs_layout_type *layoutid,
rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr);

if (rc != 0 || filelayout_check_layout(layoutid, lseg)) {
- filelayout_free_lseg(lseg);
+ _filelayout_free_lseg(lseg);
lseg = NULL;
}
return lseg;
@@ -537,12 +532,21 @@ static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
}

static void
-filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+_filelayout_free_lseg(struct pnfs_layout_segment *lseg)
{
filelayout_free_fh_array(LSEG_LD_DATA(lseg));
kfree(lseg);
}

+static void
+filelayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ dprintk("--> %s\n", __func__);
+ nfs4_unset_layout_deviceid(lseg, lseg->deviceid,
+ nfs4_fl_free_deviceid_callback);
+ _filelayout_free_lseg(lseg);
+}
+
/*
* Allocate a new nfs_write_data struct and initialize
*/
@@ -618,12 +622,8 @@ filelayout_commit(struct pnfs_layout_type *layoutid, int sync,
stripesz = filelayout_get_stripesize(layoutid);
dprintk("%s stripesize %Zd\n", __func__, stripesz);

- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(data->inode)->hlist,
- &nfslay->dev_id);
- if (dsaddr == NULL) {
- data->pdata.pnfs_error = -EIO;
- goto out;
- }
+ dsaddr = container_of(data->pdata.lseg->deviceid,
+ struct nfs4_file_layout_dsaddr, deviceid);

INIT_LIST_HEAD(&head);
INIT_LIST_HEAD(&head2);
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 12498a2..fbf307c 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -43,8 +43,7 @@ struct nfs4_pnfs_ds {
};

struct nfs4_file_layout_dsaddr {
- struct hlist_node hash_node; /* nfs4_pnfs_dev_hlist dev_list */
- struct pnfs_deviceid dev_id;
+ struct nfs4_deviceid deviceid;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
@@ -86,15 +85,13 @@ struct nfs4_filelayout {

struct filelayout_mount_type {
struct super_block *fl_sb;
- struct nfs4_pnfs_dev_hlist *hlist;
};

extern struct pnfs_client_operations *pnfs_callback_ops;

+extern void nfs4_fl_free_deviceid_callback(struct kref *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
char *deviceid_fmt(const struct pnfs_deviceid *dev_id);
-int nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
-void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
loff_t offset,
size_t count,
@@ -102,9 +99,8 @@ int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
u32 filelayout_dserver_get_index(loff_t offset,
struct nfs4_file_layout_dsaddr *di,
struct nfs4_filelayout_segment *layout);
-struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id);

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 045c204..61a3381 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,6 +45,7 @@

#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/nfs4_pnfs.h>
#include <linux/pnfs_xdr.h>
#include "nfs4filelayout.h"
#include "internal.h"
@@ -98,42 +99,6 @@ deviceid_fmt(const struct pnfs_deviceid *dev_id)
return buf;
}

-unsigned long
-_deviceid_hash(const struct pnfs_deviceid *dev_id)
-{
- unsigned char *cptr = (unsigned char *)dev_id->data;
- unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
- u64 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_PNFS_DEV_HASH_MASK;
-}
-
-/* Assumes lock is held */
-static inline struct nfs4_file_layout_dsaddr *
-_device_lookup(struct nfs4_pnfs_dev_hlist *hlist,
- const struct pnfs_deviceid *dev_id)
-{
- unsigned long hash;
- struct hlist_node *np;
-
- dprintk("_device_lookup: dev_id=%s\n", deviceid_fmt(dev_id));
-
- hash = _deviceid_hash(dev_id);
-
- hlist_for_each(np, &hlist->dev_list[hash]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np, struct nfs4_file_layout_dsaddr,
- hash_node);
- if (!memcmp(&dsaddr->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE))
- return dsaddr;
- }
- return NULL;
-}
-
/* nfs4_ds_cache_lock is held */
static inline struct nfs4_pnfs_ds *
_data_server_lookup(u32 ip_addr, u32 port)
@@ -152,22 +117,6 @@ _data_server_lookup(u32 ip_addr, u32 port)
return NULL;
}

-
-/* Assumes lock is held */
-static inline void
-_device_add(struct nfs4_pnfs_dev_hlist *hlist,
- struct nfs4_file_layout_dsaddr *dsaddr)
-{
- unsigned long hash;
-
- dprintk("_device_add: dev_id=%s ds_list:\n",
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- hash = _deviceid_hash(&dsaddr->dev_id);
- hlist_add_head(&dsaddr->hash_node, &hlist->dev_list[hash]);
-}
-
/* Create an rpc to the data server defined in 'dev_list' */
static int
nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
@@ -269,118 +218,47 @@ out_put:
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
+ dprintk("--> %s\n", __func__);
+ print_ds(ds);
+
if (ds->ds_clp)
nfs_put_client(ds->ds_clp);
kfree(ds);
}

-/* Assumes lock is NOT held */
static void
-nfs4_pnfs_device_destroy(struct nfs4_file_layout_dsaddr *dsaddr,
- struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
struct nfs4_pnfs_ds *ds;
- LIST_HEAD(release);
int i;

- if (!dsaddr)
- return;
-
- dprintk("%s: dev_id=%s\ndev_list:\n", __func__,
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- write_lock(&hlist->dev_lock);
- hlist_del_init(&dsaddr->hash_node);
+ dprintk("%s: device id=%s\n", __func__,
+ deviceid_fmt(&dsaddr->deviceid.de_id));

for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
if (ds != NULL) {
- /* if we are last user - move to release list */
if (atomic_dec_and_lock(&ds->ds_count,
&nfs4_ds_cache_lock)) {
list_del_init(&ds->ds_node);
spin_unlock(&nfs4_ds_cache_lock);
- list_add(&ds->ds_node, &release);
+ destroy_ds(ds);
}
}
}
- write_unlock(&hlist->dev_lock);
- while (!list_empty(&release)) {
- ds = list_entry(release.next, struct nfs4_pnfs_ds, ds_node);
- list_del(&ds->ds_node);
- destroy_ds(ds);
- }
+ kfree(dsaddr->stripe_indices);
kfree(dsaddr);
}

-int
-nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
-{
- int i;
-
- rwlock_init(&hlist->dev_lock);
-
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&hlist->dev_list[i]);
- }
-
- return 0;
-}
-
-/* De-alloc all devices for a mount point. This is called in
- * nfs4_kill_super.
- */
void
-nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid_callback(struct kref *kref)
{
- int i;
+ struct nfs4_deviceid *device =
+ container_of(kref, struct nfs4_deviceid, de_kref);
+ struct nfs4_file_layout_dsaddr *dsaddr =
+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);

- if (hlist == NULL)
- return;
-
- /* No lock held, as synchronization should occur at upper levels */
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- struct hlist_node *np, *next;
-
- hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np,
- struct nfs4_file_layout_dsaddr,
- hash_node);
- /* nfs4_pnfs_device_destroy grabs hlist->dev_lock */
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
- }
-}
-
-/*
- * Add the device to the list of available devices for this mount point.
- * The * rpc client is created during first I/O.
- */
-static int
-nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
- struct nfs4_file_layout_dsaddr *dsaddr)
-{
- struct nfs4_file_layout_dsaddr *tmp_dsaddr;
- struct nfs4_pnfs_dev_hlist *hlist = mt->hlist;
-
- dprintk("nfs4_pnfs_device_add\n");
-
- /* Write lock, do lookup again, and then add device */
- write_lock(&hlist->dev_lock);
- tmp_dsaddr = _device_lookup(hlist, &dsaddr->dev_id);
- if (tmp_dsaddr == NULL)
- _device_add(hlist, dsaddr);
- write_unlock(&hlist->dev_lock);
-
- /* Cleanup, if device was recently added */
- if (tmp_dsaddr != NULL) {
- dprintk(" device found, not adding (after creation)\n");
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
-
- return 0;
+ nfs4_fl_free_deviceid(dsaddr);
}

static void
@@ -514,7 +392,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
dsaddr->stripe_count = cnt;
dsaddr->ds_num = num;

- memcpy(&dsaddr->dev_id, &pdev->dev_id, NFS4_PNFS_DEVICEID4_SIZE);
+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id,
+ NFS4_PNFS_DEVICEID4_SIZE);

/* Go back an read stripe indices */
p = indicesp;
@@ -553,37 +432,40 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
}
}
}
+ nfs4_init_deviceid_node(&dsaddr->deviceid);
+
return dsaddr;

out_err_free:
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(ino)->hlist);
+ nfs4_fl_free_deviceid(dsaddr);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
}

-/* Decode the opaque device specified in 'dev'
- * and add it to the list of available devices for this
- * mount point.
- * Must at some point be followed up with nfs4_pnfs_device_destroy
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices.
+ * If the deviceid is already cached, nfs4_add_deviceid will return
+ * a pointer to the cached struct and throw away the new.
*/
static struct nfs4_file_layout_dsaddr*
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
{
struct nfs4_file_layout_dsaddr *dsaddr;
+ struct nfs4_deviceid *d;

dsaddr = decode_device(inode, dev);
if (!dsaddr) {
- printk(KERN_WARNING "%s: Could not decode device\n",
+ printk(KERN_WARNING "%s: Could not decode or add device\n",
__func__);
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(inode)->hlist);
return NULL;
}

- if (nfs4_pnfs_device_add(FILE_MT(inode), dsaddr))
- return NULL;
+ d = nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+ &dsaddr->deviceid);

- return dsaddr;
+ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}

/* Retrieve the information for dev_id, add it to the list
@@ -658,16 +540,15 @@ out_free:
}

struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id)
+nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id)
{
- struct nfs4_file_layout_dsaddr *dsaddr;
+ struct nfs4_deviceid *d;

- read_lock(&hlist->dev_lock);
- dsaddr = _device_lookup(hlist, dev_id);
- read_unlock(&hlist->dev_lock);
-
- return dsaddr;
+ d = nfs4_find_deviceid(clp->cl_devid_cache, id);
+ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__,
+ deviceid_fmt(id), d);
+ return (d == NULL) ? NULL :
+ container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}

/* Want res = ((offset / layout->stripe_unit) % dsaddr->stripe_count)
@@ -705,10 +586,8 @@ nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
if (!layout)
return 1;

- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(inode)->hlist,
- &layout->dev_id);
- if (dsaddr == NULL)
- return 1;
+ dsaddr = container_of(lseg->deviceid, struct nfs4_file_layout_dsaddr,
+ deviceid);

stripe_idx = filelayout_dserver_get_index(offset, dsaddr, layout);

--
1.6.6


2010-05-03 11:48:43

by Benny Halevy

[permalink] [raw]
Subject: Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache

On Apr. 26, 2010, 19:18 +0300, [email protected] wrote:
> From: Andy Adamson <[email protected]>
>
> A shared RCU device ID cache servicing multiple mounts of a single layout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long lived and
> read at each I/O. They are added to the deviceid cache at first reference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mountpoint
> layoutdriver_io_operation called at umount
>
> Each layoutsegment assigns a pointer and takes a reference to the
> nfs4_deviceid structure identified by the layout deviceid.
> This is so that there are no deviceid lookups for the normal I/O path.
>
> Even though required by all layouttypes, the deviceid is not exposed in the
> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
>
> Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
> and free_lseg calls nfs4_unset_layout_deviceid.
>
> While the file layout driver will not cache very many deviceid's, the object
> and block layout drivers could cache 100's for a large installation.
> Use an hlist.
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
> fs/nfs/pnfs.c | 167 +++++++++++++++++++++++++++++++++++++++++++++
> include/linux/nfs4_pnfs.h | 50 +++++++++++++
> include/linux/nfs_fs_sb.h | 1 +
> 3 files changed, 218 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..bf906cc 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> #include <linux/nfs4.h>
> #include <linux/pnfs_xdr.h>
> #include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> #include "internal.h"
> #include "nfs4_fs.h"
> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = {
>
> EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client */
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + void (*free_callback)(struct kref *))
> +{
> + struct nfs4_deviceid_cache *c;
> +
> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
> + if (!c)
> + return -ENOMEM;
> + spin_lock(&clp->cl_lock);
> + if (clp->cl_devid_cache != NULL) {
> + kref_get(&clp->cl_devid_cache->dc_kref);
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [kref [%d]]\n", __func__,
> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
> + kfree(c);
> + } else {
> + int i;
> +
> + spin_lock_init(&c->dc_lock);
> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++)
> + INIT_HLIST_HEAD(&c->dc_deviceids[i]);
> + kref_init(&c->dc_kref);
> + c->dc_free_callback = free_callback;
> + clp->cl_devid_cache = c;
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [new]\n", __func__);
> + }
> + return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + INIT_HLIST_NODE(&d->de_node);
> + kref_init(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +/* Called from layoutdriver_io_operations->alloc_lseg */
> +void
> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d)
> +{
> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> + l->deviceid = d;
> + kref_get(&d->de_kref);
> +}
> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
> +
> +/* Called from layoutdriver_io_operations->free_lseg */
> +void
> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
> + struct nfs4_deviceid *d,
> + void (*free_callback)(struct kref *))
> +{
> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount));
> + l->deviceid = NULL;
> + kref_put(&d->de_kref, free_callback);
> +}
> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> + long hash = nfs4_deviceid_hash(id);
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
> + rcu_read_unlock();
> + return d;
> + }
> + }
> + rcu_read_unlock();
> + return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
> + */
> +struct nfs4_deviceid *
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> + long hash = nfs4_deviceid_hash(&new->de_id);
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + spin_lock(&c->dc_lock);
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
> + spin_unlock(&c->dc_lock);
> + dprintk("%s [discard]\n", __func__);
> + c->dc_free_callback(&new->de_kref);
> + return d;
> + }
> + }
> + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
> + spin_unlock(&c->dc_lock);
> + dprintk("%s [new]\n", __func__);
> + return new;
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
> +{
> + struct nfs4_deviceid *d;
> + struct hlist_node *n;
> +
> + dprintk("--> %s hash %ld\n", __func__, hash);
> + spin_lock(&c->dc_lock);
> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
> + hlist_del_rcu(&d->de_node);
> + spin_unlock(&c->dc_lock);
> + synchronize_rcu();
> + dprintk("%s [%d]\n", __func__,
> + atomic_read(&d->de_kref.refcount));
> + kref_put(&d->de_kref, c->dc_free_callback);
> + return 1;
> + }
> + spin_unlock(&c->dc_lock);
> + return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + struct nfs4_deviceid_cache *cache =
> + container_of(kref, struct nfs4_deviceid_cache, dc_kref);
> + int more;
> + long i;
> +
> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
> + more = 1;
> + while (more)
> + more = nfs4_remove_deviceid(cache, i);

Andy, this can be simplified to

while (nfs4_remove_deviceid(cache, i))
;

If ok with you, I'll make this change upon merging.

Benny

> + }
> + kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs_client *clp)
> +{
> + struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache;
> + int refcount;
> +
> + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
> + spin_lock(&clp->cl_lock);
> + refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount);
> + if (refcount == 1)
> + clp->cl_devid_cache = NULL;
> + spin_unlock(&clp->cl_lock);
> + dprintk("%s [%d]\n", __func__, refcount);
> + kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 3caac60..3b7aeb7 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
> struct kref kref;
> bool valid;
> struct pnfs_layout_type *layout;
> + struct nfs4_deviceid *deviceid;
> u8 ld_data[]; /* layout driver private data */
> };
>
> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
> };
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
> + */
> +#define NFS4_DEVICE_ID_HASH_BITS 5
> +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
> +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
> +
> +static inline u32
> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
> +{
> + unsigned char *cptr = (unsigned char *)id->data;
> + unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
> + u32 x = 0;
> +
> + while (nbytes--) {
> + x *= 37;
> + x += *cptr++;
> + }
> + return x & NFS4_DEVICE_ID_HASH_MASK;
> +}
> +
> +struct nfs4_deviceid_cache {
> + spinlock_t dc_lock;
> + struct kref dc_kref;
> + void (*dc_free_callback)(struct kref *);
> + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + struct hlist_node de_node;
> + struct pnfs_deviceid de_id;
> + struct kref de_kref;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + void (*free_callback)(struct kref *));
> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
> + struct pnfs_deviceid *);
> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + struct nfs4_deviceid *);
> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
> + struct nfs4_deviceid *);
> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *,
> + struct nfs4_deviceid *,
> + void (*free_callback)(struct kref *));
> +
> /* pNFS client callback functions.
> * These operations allow the layout driver to access pNFS client
> * specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> u32 cl_exchange_flags;
> struct nfs4_session *cl_session; /* sharred session */
> struct list_head cl_lo_inodes; /* Inodes having layouts */
> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
> #endif /* CONFIG_NFS_V4_1 */
>
> #ifdef CONFIG_NFS_FSCACHE


2010-05-03 13:57:59

by Andy Adamson

[permalink] [raw]
Subject: Re: [pnfs] [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache

On Mon, May 3, 2010 at 7:48 AM, Benny Halevy <[email protected]> wrote:
> On Apr. 26, 2010, 19:18 +0300, [email protected] wrote:
>> From: Andy Adamson <[email protected]>
>>
>> A shared RCU device ID cache servicing multiple mounts of a single layout type
>> per meta data server (struct nfs_client).
>>
>> Device IDs of type deviceid4 are required by all layout types, long lived and
>> read at each I/O.  They are added to the deviceid cache at first reference by
>> a layout via GETDEVICEINFO and (currently) are only removed at umount.
>>
>> Reference count the device ID cache for each mounted file system
>> in the initialize_mountpoint layoutdriver_io_operation.
>>
>> Dereference the device id cache on file system in the uninitialize_mountpoint
>> layoutdriver_io_operation called at umount
>>
>> Each layoutsegment assigns a pointer and takes a reference to the
>> nfs4_deviceid structure identified by the layout deviceid.
>> This is so that there are no deviceid lookups for the normal I/O path.
>>
>> Even though required by all layouttypes, the deviceid is not exposed in the
>> LAYOUTGET4res but is instead hidden in the opaque layouttype4.
>>
>> Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid,
>> and free_lseg calls nfs4_unset_layout_deviceid.
>>
>> While the file layout driver will not cache very many deviceid's, the object
>> and block layout drivers could cache 100's for a large installation.
>> Use an hlist.
>>
>> Signed-off-by: Andy Adamson <[email protected]>
>> ---
>>  fs/nfs/pnfs.c             |  167 +++++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/nfs4_pnfs.h |   50 +++++++++++++
>>  include/linux/nfs_fs_sb.h |    1 +
>>  3 files changed, 218 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index 91572aa..bf906cc 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -45,6 +45,7 @@
>> =A0#include <linux/nfs4.h>
>> =A0#include <linux/pnfs_xdr.h>
>> =A0#include <linux/nfs4_pnfs.h>
>> +#include <linux/rculist.h>
>>
>> =A0#include "internal.h"
>> =A0#include "nfs4_fs.h"
>> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops =3D {
>>
>> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
>> +
>> +
>> +/* Device ID cache. Supports one layout type per struct nfs_client =
*/
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback)(s=
truct kref *))
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *c;
>> +
>> + =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERN=
EL);
>> + =A0 =A0 if (!c)
>> + =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_devid=
_cache->dc_kref.refcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
>> + =A0 =A0 } else {
>> + =A0 =A0 =A0 =A0 =A0 =A0 int i;
>> +
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE=
; i++)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_HLIST_HEAD(&c->dc_dev=
iceids[i]);
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 }
>> + =A0 =A0 return 0;
>> +}
>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> + =A0 =A0 INIT_HLIST_NODE(&d->de_node);
>> + =A0 =A0 kref_init(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>> +
>> +/* Called from layoutdriver_io_operations->alloc_lseg */
>> +void
>> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4=
_deviceid *d)
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D d;
>> + =A0 =A0 kref_get(&d->de_kref);
>> +}
>> +EXPORT_SYMBOL(nfs4_set_layout_deviceid);
>> +
>> +/* Called from layoutdriver_io_operations->free_lseg */
>> +void
>> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0struct nfs4_devicei=
d *d,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callbac=
k)(struct kref *))
>> +{
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.ref=
count));
>> + =A0 =A0 l->deviceid =3D NULL;
>> + =A0 =A0 kref_put(&d->de_kref, free_callback);
>> +}
>> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid);
>> +
>> +struct nfs4_deviceid *
>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_devic=
eid *id)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 rcu_read_lock();
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVIC=
EID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 rcu_read_unlock();
>> + =A0 =A0 return NULL;
>> +}
>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>> +
>> +/*
>> + * Add or kref_get a deviceid.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found,=
discard new
>> + */
>> +struct nfs4_deviceid *
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_device=
id *new)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> + =A0 =A0 long hash =3D nfs4_deviceid_hash(&new->de_id);
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS4_P=
NFS_DEVICEID4_SIZE)) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n", =
__func__);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&new->=
de_kref);
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
>> + =A0 =A0 =A0 =A0 =A0 =A0 }
>> + =A0 =A0 }
>> + =A0 =A0 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 dprintk("%s [new]\n", __func__);
>> + =A0 =A0 return new;
>> +}
>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>> +
>> +static int
>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash)
>> +{
>> + =A0 =A0 struct nfs4_deviceid *d;
>> + =A0 =A0 struct hlist_node *n;
>> +
>> + =A0 =A0 dprintk("--> %s hash %ld\n", __func__, hash);
>> + =A0 =A0 spin_lock(&c->dc_lock);
>> + =A0 =A0 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_=
node) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 hlist_del_rcu(&d->de_node);
>> + =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
>> + =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [%d]\n", __func__,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&d->de_kref.re=
fcount));
>> + =A0 =A0 =A0 =A0 =A0 =A0 kref_put(&d->de_kref, c->dc_free_callback)=
;
>> + =A0 =A0 =A0 =A0 =A0 =A0 return 1;
>> + =A0 =A0 }
>> + =A0 =A0 spin_unlock(&c->dc_lock);
>> + =A0 =A0 return 0;
>> +}
>> +
>> +static void
>> +nfs4_free_deviceid_cache(struct kref *kref)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *cache =3D
>> + =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid_ca=
che, dc_kref);
>> + =A0 =A0 int more;
>> + =A0 =A0 long i;
>> +
>> + =A0 =A0 for (i =3D 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 more =3D 1;
>> + =A0 =A0 =A0 =A0 =A0 =A0 while (more)
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_devic=
eid(cache, i);
>
> Andy, this can be simplified to
>
>                 while (nfs4_remove_deviceid(cache, i))
>                         ;
>
> If ok with you, I'll make this change upon merging.

Yes - looks fine, thanks.

-->Andy

>
> Benny
>
>> + =A0 =A0 }
>> + =A0 =A0 kfree(cache);
>> +}
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs_client *clp)
>> +{
>> + =A0 =A0 struct nfs4_deviceid_cache *tmp =3D clp->cl_devid_cache;
>> + =A0 =A0 int refcount;
>> +
>> + =A0 =A0 dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_de=
vid_cache);
>> + =A0 =A0 spin_lock(&clp->cl_lock);
>> + =A0 =A0 refcount =3D atomic_read(&clp->cl_devid_cache->dc_kref.ref=
count);
>> + =A0 =A0 if (refcount =3D=3D 1)
>> + =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D NULL;
>> + =A0 =A0 spin_unlock(&clp->cl_lock);
>> + =A0 =A0 dprintk("%s [%d]\n", __func__, refcount);
>> + =A0 =A0 kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>> +}
>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>> index 3caac60..3b7aeb7 100644
>> --- a/include/linux/nfs4_pnfs.h
>> +++ b/include/linux/nfs4_pnfs.h
>> @@ -106,6 +106,7 @@ struct pnfs_layout_segment {
>> =A0 =A0 =A0 struct kref kref;
>> =A0 =A0 =A0 bool valid;
>> =A0 =A0 =A0 struct pnfs_layout_type *layout;
>> + =A0 =A0 struct nfs4_deviceid *deviceid;
>> =A0 =A0 =A0 u8 ld_data[]; =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 /* lay=
out driver private data */
>> =A0};
>>
>> @@ -275,6 +276,55 @@ struct pnfs_devicelist {
>> =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIST_=
MAXNUM];
>> =A0};
>>
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and lay=
out type.
>> + */
>> +#define NFS4_DEVICE_ID_HASH_BITS =A0 =A0 5
>> +#define NFS4_DEVICE_ID_HASH_SIZE =A0 =A0 (1 << NFS4_DEVICE_ID_HASH_=
BITS)
>> +#define NFS4_DEVICE_ID_HASH_MASK =A0 =A0 (NFS4_DEVICE_ID_HASH_SIZE =
- 1)
>> +
>> +static inline u32
>> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
>> +{
>> + =A0 =A0 unsigned char *cptr =3D (unsigned char *)id->data;
>> + =A0 =A0 unsigned int nbytes =3D NFS4_PNFS_DEVICEID4_SIZE;
>> + =A0 =A0 u32 x =3D 0;
>> +
>> + =A0 =A0 while (nbytes--) {
>> + =A0 =A0 =A0 =A0 =A0 =A0 x *=3D 37;
>> + =A0 =A0 =A0 =A0 =A0 =A0 x +=3D *cptr++;
>> + =A0 =A0 }
>> + =A0 =A0 return x & NFS4_DEVICE_ID_HASH_MASK;
>> +}
>> +
>> +struct nfs4_deviceid_cache {
>> + =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
>> + =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_call=
back)(struct kref *);
>> + =A0 =A0 struct hlist_head =A0 =A0 =A0 dc_deviceids[NFS4_DEVICE_ID_=
HASH_SIZE];
>> +};
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>> + =A0 =A0 struct hlist_node =A0 =A0 =A0 de_node;
>> + =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
>> + =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 de_kref;
>> +};
>> +
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_devicei=
d_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct pnf=
s_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid=
_cache *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *);
>> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *=
,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct nfs=
4_deviceid *,
>> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*fre=
e_callback)(struct kref *));
>> +
>> =A0/* pNFS client callback functions.
>> =A0 * These operations allow the layout driver to access pNFS client
>> =A0 * specific information or call pNFS client->server operations.
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index 8522461..ef2e18e 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -87,6 +87,7 @@ struct nfs_client {
>> =A0 =A0 =A0 u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchange_=
flags;
>> =A0 =A0 =A0 struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sharr=
ed session */
>> =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* Ino=
des having layouts */
>> + =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS device=
id cache */
>> =A0#endif /* CONFIG_NFS_V4_1 */
>>
>> =A0#ifdef CONFIG_NFS_FSCACHE
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>