2010-04-16 15:52:49

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 0/3] pNFS generic device ID cache

This patch set implements a shared RCU device ID cache servicing multiple
mounts of a single layout type per meta data server (struct nfs_client).

0001-pnfs_submit-generic-device-ID-cache.patch
0002-pnfs_submit-fix-multiple-mount-set_pnfs_layoutdriver.patch
0003-pnfs-submit-file-layout-driver-generic-device-ID-cac.patch

These patches apply to the 2.6.34-rc3 pnfs branch below the following patch
which is the head of the revert patches - otherwise known as the first patch
not included in the pnfs file layout driver submission.

9a75b7356ca22b10903b507383646201cdcc9020

pnfs: filelayout: CB_NOTIFY_DEVICE support

This reverts commit 933b55c15ed2b7fd89d8e0342bd5b2825726201a
"SQUASHME pnfs_submit: remove filelayout CB_NOTIFY_DEVICE support"

Testing:

CONFIG_NFS_V4_1 set:

NFSv4.1/pNFS mounts:
Connectathon tests pass against GFS2/pNFS with a single AUTH_SYS mount, a double
AUTH_SYS mount, and an AUTH_SYS and AUTH_GSS/KRB5 mount (which creates
two superblocks under a struct nfs_client and both share the device id cache).

NFSv4.0 mount;
Connectathon tests pass

Did not test with multiple device ID's. I will create a mulitple device ID
test with the pynfs file layout server.

CONFIG_NFS_V4_1 not set:

NFSv4.0 mount: Connectathon tests pass.


-->Andy



2010-04-16 15:52:50

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache

From: Andy Adamson <[email protected]>

A shared RCU device ID cache servicing multiple mounts of a single layout type
per meta data server (struct nfs_client).

Device IDs of type deviceid4 are required by all layout types, long lived and
read at each I/O. They are added to the deviceid cache at first reference by
a layout via GETDEVICEINFO and (currently) are only removed at umount.

Reference count the device ID cache for each mounted file system
in the initialize_mountpoint layoutdriver_io_operation.

Dereference the device id cache on file system in the uninitialize_mountpoint
layoutdriver_io_operation called at umount

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/nfs4_pnfs.h | 27 ++++++++++
include/linux/nfs_fs_sb.h | 1 +
3 files changed, 147 insertions(+), 0 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 91572aa..8492aef 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -45,6 +45,7 @@
#include <linux/nfs4.h>
#include <linux/pnfs_xdr.h>
#include <linux/nfs4_pnfs.h>
+#include <linux/rculist.h>

#include "internal.h"
#include "nfs4_fs.h"
@@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {

EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
EXPORT_SYMBOL(pnfs_register_layoutdriver);
+
+
+/* Device ID cache. Supports one layout type per struct nfs_client */
+int
+nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
+ void (*free_callback)(struct rcu_head *))
+{
+ struct nfs4_deviceid_cache *c;
+
+ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+ spin_lock(&clp->cl_lock);
+ if (clp->cl_devid_cache != NULL) {
+ kref_get(&clp->cl_devid_cache->dc_kref);
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [kref [%d]]\n", __func__,
+ atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
+ kfree(c);
+ } else {
+ spin_lock_init(&c->dc_lock);
+ INIT_LIST_HEAD(&c->dc_deviceids);
+ kref_init(&c->dc_kref);
+ c->dc_free_callback = free_callback;
+ c->dc_clp = clp;
+ clp->cl_devid_cache = c;
+ spin_unlock(&clp->cl_lock);
+ dprintk("%s [new]\n", __func__);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
+
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid *d)
+{
+ INIT_LIST_HEAD(&d->de_node);
+ INIT_RCU_HEAD(&d->de_rcu);
+}
+EXPORT_SYMBOL(nfs4_init_deviceid_node);
+
+struct nfs4_deviceid *
+nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
+{
+ struct nfs4_deviceid *d;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ rcu_read_unlock();
+ return d;
+ }
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(nfs4_find_deviceid);
+
+/*
+ * Add or kref_get a deviceid.
+ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
+ */
+void
+nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
+{
+ struct nfs4_deviceid *d;
+
+ spin_lock(&c->dc_lock);
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [discard]\n", __func__);
+ c->dc_free_callback(&new->de_rcu);
+ }
+ }
+ list_add_rcu(&new->de_node, &c->dc_deviceids);
+ spin_unlock(&c->dc_lock);
+ dprintk("%s [new]\n", __func__);
+}
+EXPORT_SYMBOL(nfs4_add_deviceid);
+
+static int
+nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
+{
+ struct nfs4_deviceid *d;
+
+ spin_lock(&c->dc_lock);
+ list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
+ list_del_rcu(&d->de_node);
+ spin_unlock(&c->dc_lock);
+ synchronize_rcu();
+ c->dc_free_callback(&d->de_rcu);
+ return 1;
+ }
+ spin_unlock(&c->dc_lock);
+ return 0;
+}
+
+static void
+nfs4_free_deviceid_cache(struct kref *kref)
+{
+ struct nfs4_deviceid_cache *cache =
+ container_of(kref, struct nfs4_deviceid_cache, dc_kref);
+ int more = 1;
+
+ while (more)
+ more = nfs4_remove_deviceid(cache);
+ cache->dc_clp->cl_devid_cache = NULL;
+ kfree(cache);
+}
+
+void
+nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
+{
+ dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
+ kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
+}
+EXPORT_SYMBOL(nfs4_put_deviceid_cache);
diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
index 1d521f4..2a88a06 100644
--- a/include/linux/nfs4_pnfs.h
+++ b/include/linux/nfs4_pnfs.h
@@ -281,6 +281,33 @@ struct pnfs_devicelist {
struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
};

+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+struct nfs4_deviceid_cache {
+ spinlock_t dc_lock;
+ struct list_head dc_deviceids;
+ struct kref dc_kref;
+ void (*dc_free_callback)(struct rcu_head *);
+ struct nfs_client *dc_clp;
+};
+
+/* Device ID cache node */
+struct nfs4_deviceid {
+ struct list_head de_node;
+ struct rcu_head de_rcu;
+ struct pnfs_deviceid de_id;
+};
+
+extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
+ void (*free_callback)(struct rcu_head *));
+extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
+extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
+extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
+ struct pnfs_deviceid *);
+extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
+ struct nfs4_deviceid *);
+
/* pNFS client callback functions.
* These operations allow the layout driver to access pNFS client
* specific information or call pNFS client->server operations.
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 8522461..ef2e18e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -87,6 +87,7 @@ struct nfs_client {
u32 cl_exchange_flags;
struct nfs4_session *cl_session; /* sharred session */
struct list_head cl_lo_inodes; /* Inodes having layouts */
+ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
#endif /* CONFIG_NFS_V4_1 */

#ifdef CONFIG_NFS_FSCACHE
--
1.6.6


2010-04-16 15:52:52

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 2/3] SQUASHME pnfs_submit: fix multiple mount set_pnfs_layoutdriver

From: Andy Adamson <[email protected]>

The same struct nfs_server can enter set_pnfs_layoutdriver for mounts that
share a super block. Don't initialize a pnfs mountpoint more than once.

Don't set the pnfs_curr_ld until the pnfs mountpoint initialization succeeds

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/pnfs.c | 15 ++++++++++-----
1 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8492aef..1d903fd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -215,20 +215,25 @@ set_pnfs_layoutdriver(struct super_block *sb, struct nfs_fh *fh, u32 id)
struct pnfs_mount_type *mt;
struct nfs_server *server = NFS_SB(sb);

+ if (server->pnfs_curr_ld)
+ return;
+
if (id > 0 && find_pnfs(id, &mod)) {
- dprintk("%s: Setting pNFS module\n", __func__);
- server->pnfs_curr_ld = mod->pnfs_ld_type;
- mt = server->pnfs_curr_ld->ld_io_ops->initialize_mountpoint(
+ mt = mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
sb, fh);
if (!mt) {
printk(KERN_ERR "%s: Error initializing mount point "
"for layout driver %u. ", __func__, id);
goto out_err;
}
- /* Layout driver succeeded in initializing mountpoint */
+ /*
+ * Layout driver succeeded in initializing mountpoint
+ * and has taken a reference on the nfs_client cl_devid_cache
+ */
+ server->pnfs_curr_ld = mod->pnfs_ld_type;
server->pnfs_mountid = mt;
- /* Set the rpc_ops */
server->nfs_client->rpc_ops = &pnfs_v4_clientops;
+ dprintk("%s: pNFS module for %u set\n", __func__, id);
return;
}

--
1.6.6


2010-04-16 15:52:53

by Andy Adamson

[permalink] [raw]
Subject: [PATCH 3/3] SQUASHME pnfs-submit: file layout driver generic device ID cache

From: Andy Adamson <[email protected]>

Replace the per superblock deviceid cache with the generic deviceid cache.

Embed struct nfs4_deviceid into struct nfs4_file_layout_dsaddr, the file layout
specific deviceid structure. Provide a free_deviceid_callback.

Signed-off-by: Andy Adamson <[email protected]>
---
fs/nfs/client.c | 1 +
fs/nfs/nfs4filelayout.c | 31 +++-----
fs/nfs/nfs4filelayout.h | 12 +--
fs/nfs/nfs4filelayoutdev.c | 190 ++++++++------------------------------------
4 files changed, 51 insertions(+), 183 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e1d1646..82775b7 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -38,6 +38,7 @@
#include <net/ipv6.h>
#include <linux/nfs_xdr.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/nfs4_pnfs.h>

#include <asm/system.h>

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0530b59..c155586 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -76,17 +76,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
{
struct filelayout_mount_type *fl_mt;
struct pnfs_mount_type *mt;
- int status;

fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL);
if (!fl_mt)
goto error_ret;

- /* Initialize nfs4 file layout specific device list structure */
- fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL);
- if (!fl_mt->hlist)
- goto cleanup_fl_mt;
-
mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL);
if (!mt)
goto cleanup_fl_mt;
@@ -94,11 +88,11 @@ filelayout_initialize_mountpoint(struct super_block *sb, struct nfs_fh *fh)
fl_mt->fl_sb = sb;
mt->mountid = (void *)fl_mt;

- status = nfs4_pnfs_devlist_init(fl_mt->hlist);
- if (status)
+ if (nfs4_alloc_init_deviceid_cache(NFS_SB(sb)->nfs_client,
+ nfs4_fl_free_deviceid_callback))
goto cleanup_mt;

- dprintk("%s: device list has been initialized successfully\n",
+ dprintk("%s: deviceid cache has been initialized successfully\n",
__func__);
return mt;

@@ -106,11 +100,10 @@ cleanup_mt: ;
kfree(mt);

cleanup_fl_mt: ;
- kfree(fl_mt->hlist);
kfree(fl_mt);

error_ret: ;
- printk(KERN_WARNING "%s: device list could not be initialized\n",
+ printk(KERN_WARNING "%s: deviceid cache could not be initialized\n",
__func__);

return NULL;
@@ -123,13 +116,14 @@ filelayout_uninitialize_mountpoint(struct pnfs_mount_type *mountid)
{
struct filelayout_mount_type *fl_mt = NULL;

+ dprintk("--> %s\n", __func__);
if (mountid) {
- fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+ struct nfs4_deviceid_cache *cache;

- if (fl_mt != NULL) {
- nfs4_pnfs_devlist_destroy(fl_mt->hlist);
- kfree(fl_mt);
- }
+ fl_mt = (struct filelayout_mount_type *)mountid->mountid;
+ cache = NFS_SB(fl_mt->fl_sb)->nfs_client->cl_devid_cache;
+ nfs4_put_deviceid_cache(cache);
+ kfree(fl_mt);
kfree(mountid);
}
return 0;
@@ -381,8 +375,7 @@ filelayout_check_layout(struct pnfs_layout_type *lo,
struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo));

dprintk("--> %s\n", __func__);
- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(PNFS_INODE(lo))->hlist,
- &fl->dev_id);
+ dsaddr = nfs4_pnfs_device_item_find(nfss->nfs_client, &fl->dev_id);
if (dsaddr == NULL) {
dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id);
if (dsaddr == NULL) {
@@ -618,7 +611,7 @@ filelayout_commit(struct pnfs_layout_type *layoutid, int sync,
stripesz = filelayout_get_stripesize(layoutid);
dprintk("%s stripesize %Zd\n", __func__, stripesz);

- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(data->inode)->hlist,
+ dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(data->inode)->nfs_client,
&nfslay->dev_id);
if (dsaddr == NULL) {
data->pdata.pnfs_error = -EIO;
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 12498a2..2cb05bd 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -43,8 +43,7 @@ struct nfs4_pnfs_ds {
};

struct nfs4_file_layout_dsaddr {
- struct hlist_node hash_node; /* nfs4_pnfs_dev_hlist dev_list */
- struct pnfs_deviceid dev_id;
+ struct nfs4_deviceid deviceid;
u32 stripe_count;
u8 *stripe_indices;
u32 ds_num;
@@ -86,15 +85,13 @@ struct nfs4_filelayout {

struct filelayout_mount_type {
struct super_block *fl_sb;
- struct nfs4_pnfs_dev_hlist *hlist;
};

extern struct pnfs_client_operations *pnfs_callback_ops;

+extern void nfs4_fl_free_deviceid_callback(struct rcu_head *);
extern void print_ds(struct nfs4_pnfs_ds *ds);
char *deviceid_fmt(const struct pnfs_deviceid *dev_id);
-int nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist);
-void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist);
int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
loff_t offset,
size_t count,
@@ -102,9 +99,8 @@ int nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
u32 filelayout_dserver_get_index(loff_t offset,
struct nfs4_file_layout_dsaddr *di,
struct nfs4_filelayout_segment *layout);
-struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id);
+extern struct nfs4_file_layout_dsaddr *
+nfs4_pnfs_device_item_find(struct nfs_client *, struct pnfs_deviceid *dev_id);
struct nfs4_file_layout_dsaddr *
get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id);

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 813ddbb..411ffcb 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -45,6 +45,7 @@

#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/nfs4_pnfs.h>
#include <linux/pnfs_xdr.h>
#include "nfs4filelayout.h"
#include "internal.h"
@@ -98,42 +99,6 @@ deviceid_fmt(const struct pnfs_deviceid *dev_id)
return buf;
}

-unsigned long
-_deviceid_hash(const struct pnfs_deviceid *dev_id)
-{
- unsigned char *cptr = (unsigned char *)dev_id->data;
- unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
- u64 x = 0;
-
- while (nbytes--) {
- x *= 37;
- x += *cptr++;
- }
- return x & NFS4_PNFS_DEV_HASH_MASK;
-}
-
-/* Assumes lock is held */
-static inline struct nfs4_file_layout_dsaddr *
-_device_lookup(struct nfs4_pnfs_dev_hlist *hlist,
- const struct pnfs_deviceid *dev_id)
-{
- unsigned long hash;
- struct hlist_node *np;
-
- dprintk("_device_lookup: dev_id=%s\n", deviceid_fmt(dev_id));
-
- hash = _deviceid_hash(dev_id);
-
- hlist_for_each(np, &hlist->dev_list[hash]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np, struct nfs4_file_layout_dsaddr,
- hash_node);
- if (!memcmp(&dsaddr->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE))
- return dsaddr;
- }
- return NULL;
-}
-
/* nfs4_ds_cache_lock is held */
static inline struct nfs4_pnfs_ds *
_data_server_lookup(u32 ip_addr, u32 port)
@@ -152,22 +117,6 @@ _data_server_lookup(u32 ip_addr, u32 port)
return NULL;
}

-
-/* Assumes lock is held */
-static inline void
-_device_add(struct nfs4_pnfs_dev_hlist *hlist,
- struct nfs4_file_layout_dsaddr *dsaddr)
-{
- unsigned long hash;
-
- dprintk("_device_add: dev_id=%s ds_list:\n",
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- hash = _deviceid_hash(&dsaddr->dev_id);
- hlist_add_head(&dsaddr->hash_node, &hlist->dev_list[hash]);
-}
-
/* Create an rpc to the data server defined in 'dev_list' */
static int
nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
@@ -269,118 +218,47 @@ out_put:
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
+ dprintk("--> %s\n", __func__);
+ print_ds(ds);
+
if (ds->ds_clp)
nfs_put_client(ds->ds_clp);
kfree(ds);
}

-/* Assumes lock is NOT held */
static void
-nfs4_pnfs_device_destroy(struct nfs4_file_layout_dsaddr *dsaddr,
- struct nfs4_pnfs_dev_hlist *hlist)
+nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
struct nfs4_pnfs_ds *ds;
- LIST_HEAD(release);
int i;

- if (!dsaddr)
- return;
-
- dprintk("%s: dev_id=%s\ndev_list:\n", __func__,
- deviceid_fmt(&dsaddr->dev_id));
- print_ds_list(dsaddr);
-
- write_lock(&hlist->dev_lock);
- hlist_del_init(&dsaddr->hash_node);
+ dprintk("%s: device id=%s\n", __func__,
+ deviceid_fmt(&dsaddr->deviceid.de_id));

for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
if (ds != NULL) {
- /* if we are last user - move to release list */
if (atomic_dec_and_lock(&ds->ds_count,
&nfs4_ds_cache_lock)) {
list_del_init(&ds->ds_node);
spin_unlock(&nfs4_ds_cache_lock);
- list_add(&ds->ds_node, &release);
+ destroy_ds(ds);
}
}
}
- write_unlock(&hlist->dev_lock);
- while (!list_empty(&release)) {
- ds = list_entry(release.next, struct nfs4_pnfs_ds, ds_node);
- list_del(&ds->ds_node);
- destroy_ds(ds);
- }
+ kfree(dsaddr->stripe_indices);
kfree(dsaddr);
}

-int
-nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist)
-{
- int i;
-
- rwlock_init(&hlist->dev_lock);
-
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- INIT_HLIST_HEAD(&hlist->dev_list[i]);
- }
-
- return 0;
-}
-
-/* De-alloc all devices for a mount point. This is called in
- * nfs4_kill_super.
- */
void
-nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist)
-{
- int i;
-
- if (hlist == NULL)
- return;
-
- /* No lock held, as synchronization should occur at upper levels */
- for (i = 0; i < NFS4_PNFS_DEV_HASH_SIZE; i++) {
- struct hlist_node *np, *next;
-
- hlist_for_each_safe(np, next, &hlist->dev_list[i]) {
- struct nfs4_file_layout_dsaddr *dsaddr;
- dsaddr = hlist_entry(np,
- struct nfs4_file_layout_dsaddr,
- hash_node);
- /* nfs4_pnfs_device_destroy grabs hlist->dev_lock */
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
- }
-}
-
-/*
- * Add the device to the list of available devices for this mount point.
- * The * rpc client is created during first I/O.
- */
-static int
-nfs4_pnfs_device_add(struct filelayout_mount_type *mt,
- struct nfs4_file_layout_dsaddr *dsaddr)
+nfs4_fl_free_deviceid_callback(struct rcu_head *rcu)
{
- struct nfs4_file_layout_dsaddr *tmp_dsaddr;
- struct nfs4_pnfs_dev_hlist *hlist = mt->hlist;
-
- dprintk("nfs4_pnfs_device_add\n");
-
- /* Write lock, do lookup again, and then add device */
- write_lock(&hlist->dev_lock);
- tmp_dsaddr = _device_lookup(hlist, &dsaddr->dev_id);
- if (tmp_dsaddr == NULL)
- _device_add(hlist, dsaddr);
- write_unlock(&hlist->dev_lock);
-
- /* Cleanup, if device was recently added */
- if (tmp_dsaddr != NULL) {
- dprintk(" device found, not adding (after creation)\n");
- nfs4_pnfs_device_destroy(dsaddr, hlist);
- }
+ struct nfs4_deviceid *device =
+ container_of(rcu, struct nfs4_deviceid, de_rcu);
+ struct nfs4_file_layout_dsaddr *dsaddr =
+ container_of(device, struct nfs4_file_layout_dsaddr, deviceid);

- return 0;
+ nfs4_fl_free_deviceid(dsaddr);
}

static void
@@ -514,7 +392,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
dsaddr->stripe_count = cnt;
dsaddr->ds_num = num;

- memcpy(&dsaddr->dev_id, &pdev->dev_id, NFS4_PNFS_DEVICEID4_SIZE);
+ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id,
+ NFS4_PNFS_DEVICEID4_SIZE);

/* Go back an read stripe indices */
p = indicesp;
@@ -553,19 +432,20 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
}
}
}
+ nfs4_init_deviceid_node(&dsaddr->deviceid);
+
return dsaddr;

out_err_free:
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(ino)->hlist);
+ nfs4_fl_free_deviceid(dsaddr);
out_err:
dprintk("%s ERROR: returning NULL\n", __func__);
return NULL;
}

-/* Decode the opaque device specified in 'dev'
- * and add it to the list of available devices for this
- * mount point.
- * Must at some point be followed up with nfs4_pnfs_device_destroy
+/*
+ * Decode the opaque device specified in 'dev'
+ * and add it to the list of available devices
*/
static struct nfs4_file_layout_dsaddr*
decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
@@ -574,14 +454,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev)

dsaddr = decode_device(inode, dev);
if (!dsaddr) {
- printk(KERN_WARNING "%s: Could not decode device\n",
+ printk(KERN_WARNING "%s: Could not decode or add device\n",
__func__);
- nfs4_pnfs_device_destroy(dsaddr, FILE_MT(inode)->hlist);
return NULL;
}

- if (nfs4_pnfs_device_add(FILE_MT(inode), dsaddr))
- return NULL;
+ nfs4_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
+ &dsaddr->deviceid);

return dsaddr;
}
@@ -660,16 +539,15 @@ out_free:
}

struct nfs4_file_layout_dsaddr *
-nfs4_pnfs_device_item_find(struct nfs4_pnfs_dev_hlist *hlist,
- struct pnfs_deviceid *dev_id)
+nfs4_pnfs_device_item_find(struct nfs_client *clp, struct pnfs_deviceid *id)
{
- struct nfs4_file_layout_dsaddr *dsaddr;
-
- read_lock(&hlist->dev_lock);
- dsaddr = _device_lookup(hlist, dev_id);
- read_unlock(&hlist->dev_lock);
+ struct nfs4_deviceid *d;

- return dsaddr;
+ d = nfs4_find_deviceid(clp->cl_devid_cache, id);
+ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__,
+ deviceid_fmt(id), d);
+ return (d == NULL) ? NULL :
+ container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
}

/* Want res = ((offset / layout->stripe_unit) % dsaddr->stripe_count)
@@ -707,7 +585,7 @@ nfs4_pnfs_dserver_get(struct pnfs_layout_segment *lseg,
if (!layout)
return 1;

- dsaddr = nfs4_pnfs_device_item_find(FILE_MT(inode)->hlist,
+ dsaddr = nfs4_pnfs_device_item_find(NFS_SERVER(inode)->nfs_client,
&layout->dev_id);
if (dsaddr == NULL)
return 1;
--
1.6.6


2010-04-16 16:04:56

by Andy Adamson

[permalink] [raw]
Subject: Re: [PATCH 1/3] SQUASHME pnfs_submit: generic device ID cache

On Fri, Apr 16, 2010 at 11:52 AM, <[email protected]> wrote:
> From: Andy Adamson <[email protected]>
>
> A shared RCU device ID cache servicing multiple mounts of a single la=
yout type
> per meta data server (struct nfs_client).
>
> Device IDs of type deviceid4 are required by all layout types, long l=
ived and
> read at each I/O. =A0They are added to the deviceid cache at first re=
ference by
> a layout via GETDEVICEINFO and (currently) are only removed at umount=
=2E
>
> Reference count the device ID cache for each mounted file system
> in the initialize_mountpoint layoutdriver_io_operation.
>
> Dereference the device id cache on file system in the uninitialize_mo=
untpoint
> layoutdriver_io_operation called at umount
>
> Signed-off-by: Andy Adamson <[email protected]>
> ---
> =A0fs/nfs/pnfs.c =A0 =A0 =A0 =A0 =A0 =A0 | =A0119 +++++++++++++++++++=
++++++++++++++++++++++++++
> =A0include/linux/nfs4_pnfs.h | =A0 27 ++++++++++
> =A0include/linux/nfs_fs_sb.h | =A0 =A01 +
> =A03 files changed, 147 insertions(+), 0 deletions(-)
>
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index 91572aa..8492aef 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -45,6 +45,7 @@
> =A0#include <linux/nfs4.h>
> =A0#include <linux/pnfs_xdr.h>
> =A0#include <linux/nfs4_pnfs.h>
> +#include <linux/rculist.h>
>
> =A0#include "internal.h"
> =A0#include "nfs4_fs.h"
> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops =3D {
>
> =A0EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
> =A0EXPORT_SYMBOL(pnfs_register_layoutdriver);
> +
> +
> +/* Device ID cache. Supports one layout type per struct nfs_client *=
/
> +int
> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0void (*free_callback=
)(struct rcu_head *))
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *c;
> +
> + =A0 =A0 =A0 c =3D kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_K=
ERNEL);
> + =A0 =A0 =A0 if (!c)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return -ENOMEM;
> + =A0 =A0 =A0 spin_lock(&clp->cl_lock);
> + =A0 =A0 =A0 if (clp->cl_devid_cache !=3D NULL) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_get(&clp->cl_devid_cache->dc_kref)=
;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [kref [%d]]\n", __func__,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 atomic_read(&clp->cl_de=
vid_cache->dc_kref.refcount));
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kfree(c);
> + =A0 =A0 =A0 } else {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_lock_init(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 INIT_LIST_HEAD(&c->dc_deviceids);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 kref_init(&c->dc_kref);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback =3D free_callback;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_clp =3D clp;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 clp->cl_devid_cache =3D c;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&clp->cl_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 return 0;
> +}
> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
> +
> +void
> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
> +{
> + =A0 =A0 =A0 INIT_LIST_HEAD(&d->de_node);
> + =A0 =A0 =A0 INIT_RCU_HEAD(&d->de_rcu);
> +}
> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
> +
> +struct nfs4_deviceid *
> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_device=
id *id)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 rcu_read_lock();
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, id, NFS4_PNFS_DE=
VICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return d;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 rcu_read_unlock();
> + =A0 =A0 =A0 return NULL;
> +}
> +EXPORT_SYMBOL(nfs4_find_deviceid);
> +
> +/*
> + * Add or kref_get a deviceid.
> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, =
discard new
> + */
> +void
> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_devicei=
d *new)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (!memcmp(&d->de_id, &new->de_id, NFS=
4_PNFS_DEVICEID4_SIZE)) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock=
);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 dprintk("%s [discard]\n=
", __func__);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&ne=
w->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 list_add_rcu(&new->de_node, &c->dc_deviceids);
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 dprintk("%s [new]\n", __func__);
> +}
> +EXPORT_SYMBOL(nfs4_add_deviceid);
> +
> +static int
> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid *d;
> +
> + =A0 =A0 =A0 spin_lock(&c->dc_lock);
> + =A0 =A0 =A0 list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 list_del_rcu(&d->de_node);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 synchronize_rcu();
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 c->dc_free_callback(&d->de_rcu);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 return 1;
> + =A0 =A0 =A0 }
> + =A0 =A0 =A0 spin_unlock(&c->dc_lock);
> + =A0 =A0 =A0 return 0;
> +}
> +
> +static void
> +nfs4_free_deviceid_cache(struct kref *kref)
> +{
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cache =3D
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 container_of(kref, struct nfs4_deviceid=
_cache, dc_kref);
> + =A0 =A0 =A0 int more =3D 1;
> +
> + =A0 =A0 =A0 while (more)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 more =3D nfs4_remove_deviceid(cache);
> + =A0 =A0 =A0 cache->dc_clp->cl_devid_cache =3D NULL;

Need to take the cl_lock around this assignment

spin_lock(&cache->dc_clp->cl_lock);
cache->dc_clp->cl_devid_cache =3D NULL
spin_unlock(&cache->dc_clp->cl_lock);


> + =A0 =A0 =A0 kfree(cache);
> +}
> +
> +void
> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
> +{
> + =A0 =A0 =A0 dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.=
refcount));
> + =A0 =A0 =A0 kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
> +}
> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
> index 1d521f4..2a88a06 100644
> --- a/include/linux/nfs4_pnfs.h
> +++ b/include/linux/nfs4_pnfs.h
> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
> =A0 =A0 =A0 =A0struct pnfs_deviceid =A0 =A0dev_id[NFS4_PNFS_GETDEVLIS=
T_MAXNUM];
> =A0};
>
> +/*
> + * Device ID RCU cache. A device ID is unique per client ID and layo=
ut type.
> + */
> +struct nfs4_deviceid_cache {
> + =A0 =A0 =A0 spinlock_t =A0 =A0 =A0 =A0 =A0 =A0 =A0dc_lock;
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0dc_deviceids;
> + =A0 =A0 =A0 struct kref =A0 =A0 =A0 =A0 =A0 =A0 dc_kref;
> + =A0 =A0 =A0 void =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0(*dc_free_c=
allback)(struct rcu_head *);
> + =A0 =A0 =A0 struct nfs_client =A0 =A0 =A0 *dc_clp;
> +};
> +
> +/* Device ID cache node */
> +struct nfs4_deviceid {
> + =A0 =A0 =A0 struct list_head =A0 =A0 =A0 =A0de_node;
> + =A0 =A0 =A0 struct rcu_head =A0 =A0 =A0 =A0 de_rcu;
> + =A0 =A0 =A0 struct pnfs_deviceid =A0 =A0de_id;
> +};
> +
> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 void (*=
free_callback)(struct rcu_head *));
> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid=
_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
pnfs_deviceid *);
> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 struct =
nfs4_deviceid *);
> +
> =A0/* pNFS client callback functions.
> =A0* These operations allow the layout driver to access pNFS client
> =A0* specific information or call pNFS client->server operations.
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 8522461..ef2e18e 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -87,6 +87,7 @@ struct nfs_client {
> =A0 =A0 =A0 =A0u32 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 cl_exchang=
e_flags;
> =A0 =A0 =A0 =A0struct nfs4_session =A0 =A0 *cl_session; =A0 =A0/* sha=
rred session */
> =A0 =A0 =A0 =A0struct list_head =A0 =A0 =A0 =A0cl_lo_inodes; =A0 /* I=
nodes having layouts */
> + =A0 =A0 =A0 struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS dev=
iceid cache */
> =A0#endif /* CONFIG_NFS_V4_1 */
>
> =A0#ifdef CONFIG_NFS_FSCACHE
> --
> 1.6.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" =
in
> the body of a message to [email protected]
> More majordomo info at =A0http://vger.kernel.org/majordomo-info.html
>