2012-02-07 00:57:24

by Myklebust, Trond

Subject: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

It is perfectly legal to negotiate up to 2^32-1 slots in the protocol,
and with 10GigE, we are already seeing that 255 slots is far too limiting.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/callback.h | 2 +-
fs/nfs/callback_xdr.c | 6 +++---
fs/nfs/nfs4proc.c | 21 ++++++++++-----------
include/linux/nfs_fs_sb.h | 7 ++++---
include/linux/nfs_xdr.h | 2 +-
5 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 197e0d3..a5527c9 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -38,7 +38,7 @@ enum nfs4_callback_opnum {
struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
- int slotid;
+ u32 slotid;
struct net *net;
};

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 2e37224..5466829 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -759,14 +759,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* Let the state manager know callback processing done.
* A single slot, so highest used slotid is either 0 or -1
*/
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
nfs4_check_drain_bc_complete(session);
spin_unlock(&tbl->slot_tbl_lock);
}

static void nfs4_cb_free_slot(struct cb_process_state *cps)
{
- if (cps->slotid != -1)
+ if (cps->slotid != NFS4_NO_SLOT)
nfs4_callback_free_slot(cps->clp->cl_session);
}

@@ -860,7 +860,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
- .slotid = -1,
+ .slotid = NFS4_NO_SLOT,
.net = rqstp->rq_xprt->xpt_net,
};
unsigned int nops = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 482ed97..0dc5dfb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -402,7 +402,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
return;
}

- if (ses->fc_slot_table.highest_used_slotid != -1)
+ if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
return;

dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
@@ -415,7 +415,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
{
if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
- ses->bc_slot_table.highest_used_slotid != -1)
+ ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
return;
dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
complete(&ses->bc_slot_table.complete);
@@ -514,14 +514,13 @@ static int nfs4_sequence_done(struct rpc_task *task,
*
* Note: must be called with under the slot_tbl_lock.
*/
-static u8
+static u32
nfs4_find_slot(struct nfs4_slot_table *tbl)
{
- int slotid;
- u8 ret_id = NFS4_MAX_SLOT_TABLE;
- BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
+ u32 slotid;
+ u32 ret_id = NFS4_NO_SLOT;

- dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
+ dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
tbl->max_slots);
slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
@@ -555,7 +554,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
{
struct nfs4_slot *slot;
struct nfs4_slot_table *tbl;
- u8 slotid;
+ u32 slotid;

dprintk("--> %s\n", __func__);
/* slot already allocated? */
@@ -5144,7 +5143,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
spin_lock(&tbl->slot_tbl_lock);
tbl->max_slots = max_slots;
tbl->slots = slot;
- tbl->highest_used_slotid = -1; /* no slot is currently used */
+ tbl->highest_used_slotid = NFS4_NO_SLOT; /* no slot is currently used */
spin_unlock(&tbl->slot_tbl_lock);
dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
tbl, tbl->slots, tbl->max_slots);
@@ -5196,13 +5195,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
return NULL;

tbl = &session->fc_slot_table;
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
spin_lock_init(&tbl->slot_tbl_lock);
rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
init_completion(&tbl->complete);

tbl = &session->bc_slot_table;
- tbl->highest_used_slotid = -1;
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
spin_lock_init(&tbl->slot_tbl_lock);
rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
init_completion(&tbl->complete);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 9e101c1..2ae57f2 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -191,6 +191,7 @@ struct nfs_server {

/* maximum number of slots to use */
#define NFS4_MAX_SLOT_TABLE (128U)
+#define NFS4_NO_SLOT ((u32)-1)

#if defined(CONFIG_NFS_V4)

@@ -201,10 +202,10 @@ struct nfs4_slot_table {
unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
spinlock_t slot_tbl_lock;
struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
- int max_slots; /* # slots in table */
- int highest_used_slotid; /* sent to server on each SEQ.
+ u32 max_slots; /* # slots in table */
+ u32 highest_used_slotid; /* sent to server on each SEQ.
* op for dynamic resizing */
- int target_max_slots; /* Set by CB_RECALL_SLOT as
+ u32 target_max_slots; /* Set by CB_RECALL_SLOT as
* the new max_slots */
struct completion complete;
};
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 144419a..adbc84a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -181,7 +181,7 @@ struct nfs4_slot {

struct nfs4_sequence_args {
struct nfs4_session *sa_session;
- u8 sa_slotid;
+ u32 sa_slotid;
u8 sa_cache_this;
};

--
1.7.7.6
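
A minimal userspace sketch (not part of the original posting) of the slot-allocation pattern the patch touches: the linear scan for a free slot and the NFS4_NO_SLOT sentinel mirror nfs4_find_slot() and the ((u32)-1) definition above, while the fixed-size array, DEMO_MAX_SLOTS and demo_find_slot() are invented for illustration.

/* illustrative only -- not kernel code */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NFS4_NO_SLOT    ((uint32_t)-1)   /* same sentinel as the patch */
#define DEMO_MAX_SLOTS  300              /* > 255, i.e. needs more than a u8 */

static unsigned char used[DEMO_MAX_SLOTS];   /* stand-in for the used_slots bitmap */

/* return the lowest free slotid, or NFS4_NO_SLOT if the table is full */
static uint32_t demo_find_slot(void)
{
	for (uint32_t slotid = 0; slotid < DEMO_MAX_SLOTS; slotid++) {
		if (!used[slotid]) {
			used[slotid] = 1;
			return slotid;
		}
	}
	return NFS4_NO_SLOT;
}

int main(void)
{
	memset(used, 1, sizeof(used));          /* pretend every slot is busy */
	if (demo_find_slot() == NFS4_NO_SLOT)   /* well-defined u32 compare */
		printf("table full, caller would wait on slot_tbl_waitq\n");

	used[42] = 0;                           /* a slot gets freed */
	printf("allocated slotid %u\n", demo_find_slot());   /* prints 42 */
	return 0;
}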



2012-02-08 20:51:21

by Myklebust, Trond

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Wed, 2012-02-08 at 15:31 -0500, Jim Rees wrote:
> J. Bruce Fields wrote:
>
>   On Wed, Feb 08, 2012 at 12:49:01PM -0500, Jim Rees wrote:
>   > Myklebust, Trond wrote:
>   >
>   >   10GigE + high latencies is exactly where we're seeing the value. Andy
>   >   has been working with the high energy physics community doing NFS
>   >   traffic between the US and CERN...
>   >
>   > CITI to CERN is just over 120ms.  I don't know what it would be from Andy's
>   > house.  Does he have 10G at home yet?
>
>   That still seems short of what you'd need to get a 255MB bandwidth-delay
>   product.
>
>   I'm just curious what the experiment is here and whether there's a
>   possibility the real problem is elsewhere.
>
> In my opinion, any fix that involves allocating multiple parallel data
> streams (rpc slots, tcp connections) is masking the real problem.  But it's
> an effective fix.

Who said anything about multiple tcp connections? All the slots do is
allow the server to process more RPC calls in parallel by feeding it
more work. How is that masking a problem?

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
www.netapp.com

2012-02-08 17:27:29

by Myklebust, Trond

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Wed, 2012-02-08 at 11:23 -0500, J. Bruce Fields wrote:
> On Mon, Feb 06, 2012 at 07:57:16PM -0500, Trond Myklebust wrote:
> > It is perfectly legal to negotiate up to 2^32-1 slots in the protocol,
> > and with 10GigE, we are already seeing that 255 slots is far too limiting.
>
> Seems like an entirely reasonable change, but just out of curiosity,
> what workload are you using to hit that limit?
>
> (If it's just bulk IO over a local 10GigE network, then surely the
> problem is elsewhere?  Even on 10GigE I think you'd need fairly high
> latencies (a little over 200ms, if my math is right?) to keep that much
> data in flight.  Or are you testing much smaller operations?)

10GigE + high latencies is exactly where we're seeing the value. Andy
has been working with the high energy physics community doing NFS
traffic between the US and CERN...

--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
www.netapp.com

2012-02-09 18:39:15

by J. Bruce Fields

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Thu, Feb 09, 2012 at 09:37:22AM +0100, Tigran Mkrtchyan wrote:
> Putting my 'high energy physics community' hat on let me comment on it.
>
> As soon as we trying to use nfs over high latency networks the

How high is the latency?

> application efficiency rapidly drops. Efficiency is wallTIme/cpuTime.
> We have solve this in our home grown protocols by adding vector read
> and vector write. Vector is set of offset_length. As most of the our
> files has DB like structure, after reading header (something like
> index) we knew where data is located. This allows us to perform in
> some work loads 100 times better than NFS.
>
> Posix does not provides such interface. But we can simulate that with
> fadvise calls (and we do). Since nfs-4.0 we got compound operations.
> And you can (in theory) build a compound with multiple READ or WRITE
> ops. Nevertheless this does not work for several reasons: maximal
> reply size and you still have to wait for full reply. and some reply
> may be up 100MB in size.
>
> The solution here is to issue multiple requests in parallel. And this
> is possible only if you have enough session slots. Server can reply
> out of order and populate clients file system cache.

Yep.

I'm just curious whether Andy or someone's been doing experiments with
these patches, and if so, what they look like.

(The numbers I can find from the one case we worked on at CITI (UM to
CERN) were 10 gig * 120ms latency for a 143MB bandwidth-delay product,
so in theory 143 slots would suffice if they were all doing maximum-size
IO--but I can't find any results from NFS tests over that link, only
results for a lower-latency 10gig network.)

And, of course, if you're doing smaller operations or have an even
higher-latency network, etc., you could need more slots--I just wondered
about the details.)

--b.

2012-02-09 08:37:23

by Mkrtchyan, Tigran

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

Putting my 'high energy physics community' hat on, let me comment on it.

As soon as we try to use NFS over high-latency networks, the
application efficiency rapidly drops. Efficiency is wallTime/cpuTime.
We have solved this in our home-grown protocols by adding vector read
and vector write. A vector is a set of offset_length pairs. As most of
our files have a DB-like structure, after reading the header (something
like an index) we know where the data is located. This allows us to
perform 100 times better than NFS in some workloads.

POSIX does not provide such an interface, but we can simulate it with
fadvise calls (and we do). Since NFSv4.0 we have compound operations,
and you can (in theory) build a compound with multiple READ or WRITE
ops. Nevertheless this does not work, for several reasons: the maximum
reply size, and you still have to wait for the full reply, which may be
up to 100MB in size.

The solution here is to issue multiple requests in parallel, and this
is possible only if you have enough session slots. The server can reply
out of order and populate the client's file system cache.

Tigran.
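
A rough userspace sketch (not part of Tigran's mail) of the "multiple requests in parallel" pattern described above, using POSIX AIO. Whether each read really becomes a separate in-flight RPC (and therefore occupies a session slot) depends on the AIO implementation, rsize and readahead; the mount path, request count and I/O size are invented.

#include <aio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NREQS  64            /* concurrent reads; each wants its own session slot */
#define IOSIZE (64 * 1024)

int main(void)
{
	struct aiocb cb[NREQS];
	int fd = open("/mnt/nfs41/data.db", O_RDONLY);   /* invented path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (int i = 0; i < NREQS; i++) {
		memset(&cb[i], 0, sizeof(cb[i]));
		cb[i].aio_fildes = fd;
		cb[i].aio_buf    = malloc(IOSIZE);
		cb[i].aio_nbytes = IOSIZE;
		cb[i].aio_offset = (off_t)i * IOSIZE;
		aio_read(&cb[i]);                        /* queue, do not wait */
	}
	for (int i = 0; i < NREQS; i++) {
		const struct aiocb *list[1] = { &cb[i] };
		aio_suspend(list, 1, NULL);              /* completions may arrive out of order */
		printf("read %d done: %zd bytes\n", i, aio_return(&cb[i]));
	}
	close(fd);
	return 0;
}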

On Wed, Feb 8, 2012 at 10:01 PM, Jim Rees <[email protected]> wrote:
> Myklebust, Trond wrote:
>
>  On Wed, 2012-02-08 at 15:31 -0500, Jim Rees wrote:
>  > J. Bruce Fields wrote:
>  >
>  >   On Wed, Feb 08, 2012 at 12:49:01PM -0500, Jim Rees wrote:
>  >   > Myklebust, Trond wrote:
>  >   >
>  >   >   10GigE + high latencies is exactly where we're seeing the value. Andy
>  >   >   has been working with the high energy physics community doing NFS
>  >   >   traffic between the US and CERN...
>  >   >
>  >   > CITI to CERN is just over 120ms.  I don't know what it would be from Andy's
>  >   > house.  Does he have 10G at home yet?
>  >
>  >   That still seems short of what you'd need to get a 255MB bandwidth-delay
>  >   product.
>  >
>  >   I'm just curious what the experiment is here and whether there's a
>  >   possibility the real problem is elsewhere.
>  >
>  > In my opinion, any fix that involves allocating multiple parallel data
>  > streams (rpc slots, tcp connections) is masking the real problem.  But it's
>  > an effective fix.
>
>  Who said anything about multiple tcp connections? All the slots do is
>  allow the server to process more RPC calls in parallel by feeding it
>  more work. How is that masking a problem?
>
> Sorry, the comma was intended to be "or".  I realize there is just one tcp
> connection.

2012-02-07 00:57:25

by Myklebust, Trond

Subject: [RFC PATCH 2/2] NFSv4.1: Add a module parameter to set the number of session slots

Add the module parameter 'max_session_slots' to set the initial number
of slots that the NFSv4.1 client will attempt to negotiate with the
server.

Signed-off-by: Trond Myklebust <[email protected]>
---
Documentation/kernel-parameters.txt | 8 ++++++++
fs/nfs/nfs4proc.c | 8 +++++++-
include/linux/nfs_fs_sb.h | 3 ++-
3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 033d4e6..1d369c6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1657,6 +1657,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
of returning the full 64-bit number.
The default is to return 64-bit inode numbers.

+ nfs.max_session_slots=
+ [NFSv4.1] Sets the maximum number of session slots
+ the client will attempt to negotiate with the server.
+ This limits the number of simultaneous RPC requests
+ that the client can send to the NFSv4.1 server.
+ Note that there is little point in setting this
+ value higher than the max_tcp_slot_table_limit.
+
nfs.nfs4_disable_idmapping=
[NFSv4] When set to the default of '1', this option
ensures that both the RPC level authentication
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0dc5dfb..da985c3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,6 +72,8 @@

#define NFS4_MAX_LOOP_ON_RECOVER (10)

+static unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -5246,7 +5248,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->fc_attrs.max_rqst_sz = mxrqst_sz;
args->fc_attrs.max_resp_sz = mxresp_sz;
args->fc_attrs.max_ops = NFS4_MAX_OPS;
- args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs;
+ args->fc_attrs.max_reqs = max_session_slots;

dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u "
"max_ops=%u max_reqs=%u\n",
@@ -6391,6 +6393,10 @@ const struct xattr_handler *nfs4_xattr_handlers[] = {
NULL
};

+module_param(max_session_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
+ "requests the client will negotiate");
+
/*
* Local variables:
* c-basic-offset: 8
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2ae57f2..ba1e08d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -190,7 +190,8 @@ struct nfs_server {


/* maximum number of slots to use */
-#define NFS4_MAX_SLOT_TABLE (128U)
+#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
+#define NFS4_MAX_SLOT_TABLE (255U)
#define NFS4_NO_SLOT ((u32)-1)

#if defined(CONFIG_NFS_V4)
--
1.7.7.6
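
For reference, a usage sketch that is not part of the patch mail: with the parameter living in the nfs module as above, it can be set in the usual three ways (the value 64 is only an example).

    # on the kernel command line, as documented in kernel-parameters.txt above
    nfs.max_session_slots=64

    # as a module option, e.g. in /etc/modprobe.d/nfs.conf
    options nfs max_session_slots=64

    # at runtime via sysfs (the parameter is declared with mode 0644);
    # only sessions negotiated after the write use the new value
    echo 64 > /sys/module/nfs/parameters/max_session_slots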


2012-02-08 07:34:08

by Benny Halevy

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

I reviewed both patches and they look good to me.

Benny

On 2012-02-07 02:57, Trond Myklebust wrote:
> It is perfectly legal to negotiate up to 2^32-1 slots in the protocol,
> and with 10GigE, we are already seeing that 255 slots is far too limiting.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/callback.h | 2 +-
> fs/nfs/callback_xdr.c | 6 +++---
> fs/nfs/nfs4proc.c | 21 ++++++++++-----------
> include/linux/nfs_fs_sb.h | 7 ++++---
> include/linux/nfs_xdr.h | 2 +-
> 5 files changed, 19 insertions(+), 19 deletions(-)
>
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index 197e0d3..a5527c9 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -38,7 +38,7 @@ enum nfs4_callback_opnum {
> struct cb_process_state {
> __be32 drc_status;
> struct nfs_client *clp;
> - int slotid;
> + u32 slotid;
> struct net *net;
> };
>
> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
> index 2e37224..5466829 100644
> --- a/fs/nfs/callback_xdr.c
> +++ b/fs/nfs/callback_xdr.c
> @@ -759,14 +759,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
> * Let the state manager know callback processing done.
> * A single slot, so highest used slotid is either 0 or -1
> */
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> nfs4_check_drain_bc_complete(session);
> spin_unlock(&tbl->slot_tbl_lock);
> }
>
> static void nfs4_cb_free_slot(struct cb_process_state *cps)
> {
> - if (cps->slotid != -1)
> + if (cps->slotid != NFS4_NO_SLOT)
> nfs4_callback_free_slot(cps->clp->cl_session);
> }
>
> @@ -860,7 +860,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
> struct cb_process_state cps = {
> .drc_status = 0,
> .clp = NULL,
> - .slotid = -1,
> + .slotid = NFS4_NO_SLOT,
> .net = rqstp->rq_xprt->xpt_net,
> };
> unsigned int nops = 0;
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 482ed97..0dc5dfb 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -402,7 +402,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
> return;
> }
>
> - if (ses->fc_slot_table.highest_used_slotid != -1)
> + if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
> return;
>
> dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
> @@ -415,7 +415,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
> void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
> {
> if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
> - ses->bc_slot_table.highest_used_slotid != -1)
> + ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
> return;
> dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
> complete(&ses->bc_slot_table.complete);
> @@ -514,14 +514,13 @@ static int nfs4_sequence_done(struct rpc_task *task,
> *
> * Note: must be called with under the slot_tbl_lock.
> */
> -static u8
> +static u32
> nfs4_find_slot(struct nfs4_slot_table *tbl)
> {
> - int slotid;
> - u8 ret_id = NFS4_MAX_SLOT_TABLE;
> - BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
> + u32 slotid;
> + u32 ret_id = NFS4_NO_SLOT;
>
> - dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
> + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
> __func__, tbl->used_slots[0], tbl->highest_used_slotid,
> tbl->max_slots);
> slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
> @@ -555,7 +554,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
> {
> struct nfs4_slot *slot;
> struct nfs4_slot_table *tbl;
> - u8 slotid;
> + u32 slotid;
>
> dprintk("--> %s\n", __func__);
> /* slot already allocated? */
> @@ -5144,7 +5143,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
> spin_lock(&tbl->slot_tbl_lock);
> tbl->max_slots = max_slots;
> tbl->slots = slot;
> - tbl->highest_used_slotid = -1; /* no slot is currently used */
> + tbl->highest_used_slotid = NFS4_NO_SLOT; /* no slot is currently used */
> spin_unlock(&tbl->slot_tbl_lock);
> dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
> tbl, tbl->slots, tbl->max_slots);
> @@ -5196,13 +5195,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
> return NULL;
>
> tbl = &session->fc_slot_table;
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> spin_lock_init(&tbl->slot_tbl_lock);
> rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
> init_completion(&tbl->complete);
>
> tbl = &session->bc_slot_table;
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> spin_lock_init(&tbl->slot_tbl_lock);
> rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
> init_completion(&tbl->complete);
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 9e101c1..2ae57f2 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -191,6 +191,7 @@ struct nfs_server {
>
> /* maximum number of slots to use */
> #define NFS4_MAX_SLOT_TABLE (128U)
> +#define NFS4_NO_SLOT ((u32)-1)
>
> #if defined(CONFIG_NFS_V4)
>
> @@ -201,10 +202,10 @@ struct nfs4_slot_table {
> unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
> spinlock_t slot_tbl_lock;
> struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
> - int max_slots; /* # slots in table */
> - int highest_used_slotid; /* sent to server on each SEQ.
> + u32 max_slots; /* # slots in table */
> + u32 highest_used_slotid; /* sent to server on each SEQ.
> * op for dynamic resizing */
> - int target_max_slots; /* Set by CB_RECALL_SLOT as
> + u32 target_max_slots; /* Set by CB_RECALL_SLOT as
> * the new max_slots */
> struct completion complete;
> };
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 144419a..adbc84a 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -181,7 +181,7 @@ struct nfs4_slot {
>
> struct nfs4_sequence_args {
> struct nfs4_session *sa_session;
> - u8 sa_slotid;
> + u32 sa_slotid;
> u8 sa_cache_this;
> };
>

2012-02-10 16:06:31

by Andy Adamson

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Thu, Feb 9, 2012 at 1:39 PM, J. Bruce Fields <[email protected]> wrote:
> On Thu, Feb 09, 2012 at 09:37:22AM +0100, Tigran Mkrtchyan wrote:
>> Putting my 'high energy physics community' hat on let me comment on it.
>>
>> As soon as we trying to use nfs over high latency networks the
>
> How high is the latency?
>
>> application efficiency rapidly drops. Efficiency is wallTIme/cpuTime.
>> We have solve this in our home grown protocols by adding vector read
>> and vector write. Vector is set of offset_length. As most of the our
>> files has DB like structure, after reading header (something like
>> index) we knew where data is located. This allows us to perform in
>> some work loads 100 times better than NFS.
>>
>> Posix does not provides such interface. But we can simulate that with
>> fadvise calls (and we do). Since nfs-4.0 we got compound operations.
>> And you can (in theory) build a compound with multiple READ or WRITE
>> ops. Nevertheless this does not work for several reasons: maximal
>> reply size and you still have to wait for full reply. and some reply
>> may be up 100MB in size.
>>
>> The solution here is to issue multiple requests in parallel. And this
>> is possible only if you have enough session slots. Server can reply
>> out of order and populate clients file system cache.
>
> Yep.
>
> I'm just curious whether Andy or someone's been doing experiments with
> these patches, and if so, what they look like.
>
> (The numbers I can find from the one case we worked on at citi (UM to
> CERN), were 10 gig * 120ms latency for a 143MB bandwidth-delay product,
> so in theory 143 slots would suffice if they were all doing maximum-size

The net I tested with has a 127ms delay (~152MB bandwidth-delay
product) - the same ballpark as 120ms.
As you mention, you are assuming an rsize/wsize of 1MB. Think of a
server that only supports 64k and you need a lot more slots (143 * 16
= 2288 slots) to fill the 143MB bandwidth-delay product. A 64K r/wsize
server with 255 slots could only fill a 13ms-latency 10G net.

Plus - 10Gb nets are old tech! The CERN/UMICH machines I was working
with had 40Gb NICs. 100Gb nets are on the way...

-->Andy
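
A back-of-the-envelope sketch of the arithmetic above (purely illustrative, not part of Andy's mail): the 10Gb/s link and 127ms RTT are taken from the message, the 1MB and 64KB I/O sizes from the discussion.

#include <stdio.h>

int main(void)
{
	double bdp = (10e9 / 8) * 0.127;                /* bytes in flight: ~159MB */
	unsigned int io_size[] = { 1 << 20, 64 << 10 }; /* 1MB vs 64KB rsize/wsize */

	for (int i = 0; i < 2; i++)
		printf("rsize/wsize %4uKB -> ~%.0f slots to fill the pipe\n",
		       io_size[i] >> 10, bdp / io_size[i]);
	return 0;
}
/* prints roughly 151 slots at 1MB and 2422 at 64KB, the same order of
 * magnitude as the 143 and 2288 figures quoted in the thread */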

> IO--but I can't find any results from NFS tests over that link (only
> results for a lower-latency 10gig network).
>
> And, of course, if you're doing smaller operations or have an even
> higher-latency network, etc., you could need more slots--I just wondered
> about the details.)
>
> --b.

2012-02-08 16:23:23

by J. Bruce Fields

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Mon, Feb 06, 2012 at 07:57:16PM -0500, Trond Myklebust wrote:
> It is perfectly legal to negotiate up to 2^32-1 slots in the protocol,
> and with 10GigE, we are already seeing that 255 slots is far too limiting.

Seems like an entirely reasonable change, but just out of curiosity,
what workload are you using to hit that limit?

(If it's just bulk IO over a local 10GigE network, then surely the
problem is elsewhere? Even on 10GigE I think you'd need fairly high
latencies (a little over 200ms, if my math is right?) to keep that much
data in flight. Or are you testing much smaller operations?)

--b.
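
As an aside (not part of the original mail), a quick sketch of the math hinted at above, assuming 255 slots each carrying a maximum-size 1MB request over a 10Gb/s link:

#include <stdio.h>

int main(void)
{
	double in_flight = 255.0 * (1 << 20);   /* 255 x 1MB outstanding */
	double link_rate = 10e9 / 8;            /* 10Gb/s in bytes per second */

	printf("in flight: %.0f MiB, RTT needed to keep it all in flight: %.0f ms\n",
	       in_flight / (1 << 20), in_flight / link_rate * 1e3);
	/* ~255 MiB and ~214 ms, i.e. "a little over 200ms" */
	return 0;
}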

>
> Signed-off-by: Trond Myklebust <[email protected]>
> ---
> fs/nfs/callback.h | 2 +-
> fs/nfs/callback_xdr.c | 6 +++---
> fs/nfs/nfs4proc.c | 21 ++++++++++-----------
> include/linux/nfs_fs_sb.h | 7 ++++---
> include/linux/nfs_xdr.h | 2 +-
> 5 files changed, 19 insertions(+), 19 deletions(-)
>
> diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
> index 197e0d3..a5527c9 100644
> --- a/fs/nfs/callback.h
> +++ b/fs/nfs/callback.h
> @@ -38,7 +38,7 @@ enum nfs4_callback_opnum {
> struct cb_process_state {
> __be32 drc_status;
> struct nfs_client *clp;
> - int slotid;
> + u32 slotid;
> struct net *net;
> };
>
> diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
> index 2e37224..5466829 100644
> --- a/fs/nfs/callback_xdr.c
> +++ b/fs/nfs/callback_xdr.c
> @@ -759,14 +759,14 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
> * Let the state manager know callback processing done.
> * A single slot, so highest used slotid is either 0 or -1
> */
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> nfs4_check_drain_bc_complete(session);
> spin_unlock(&tbl->slot_tbl_lock);
> }
>
> static void nfs4_cb_free_slot(struct cb_process_state *cps)
> {
> - if (cps->slotid != -1)
> + if (cps->slotid != NFS4_NO_SLOT)
> nfs4_callback_free_slot(cps->clp->cl_session);
> }
>
> @@ -860,7 +860,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
> struct cb_process_state cps = {
> .drc_status = 0,
> .clp = NULL,
> - .slotid = -1,
> + .slotid = NFS4_NO_SLOT,
> .net = rqstp->rq_xprt->xpt_net,
> };
> unsigned int nops = 0;
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index 482ed97..0dc5dfb 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -402,7 +402,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
> return;
> }
>
> - if (ses->fc_slot_table.highest_used_slotid != -1)
> + if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
> return;
>
> dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
> @@ -415,7 +415,7 @@ static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
> void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
> {
> if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
> - ses->bc_slot_table.highest_used_slotid != -1)
> + ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
> return;
> dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
> complete(&ses->bc_slot_table.complete);
> @@ -514,14 +514,13 @@ static int nfs4_sequence_done(struct rpc_task *task,
> *
> * Note: must be called with under the slot_tbl_lock.
> */
> -static u8
> +static u32
> nfs4_find_slot(struct nfs4_slot_table *tbl)
> {
> - int slotid;
> - u8 ret_id = NFS4_MAX_SLOT_TABLE;
> - BUILD_BUG_ON((u8)NFS4_MAX_SLOT_TABLE != (int)NFS4_MAX_SLOT_TABLE);
> + u32 slotid;
> + u32 ret_id = NFS4_NO_SLOT;
>
> - dprintk("--> %s used_slots=%04lx highest_used=%d max_slots=%d\n",
> + dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
> __func__, tbl->used_slots[0], tbl->highest_used_slotid,
> tbl->max_slots);
> slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
> @@ -555,7 +554,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
> {
> struct nfs4_slot *slot;
> struct nfs4_slot_table *tbl;
> - u8 slotid;
> + u32 slotid;
>
> dprintk("--> %s\n", __func__);
> /* slot already allocated? */
> @@ -5144,7 +5143,7 @@ static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
> spin_lock(&tbl->slot_tbl_lock);
> tbl->max_slots = max_slots;
> tbl->slots = slot;
> - tbl->highest_used_slotid = -1; /* no slot is currently used */
> + tbl->highest_used_slotid = NFS4_NO_SLOT; /* no slot is currently used */
> spin_unlock(&tbl->slot_tbl_lock);
> dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
> tbl, tbl->slots, tbl->max_slots);
> @@ -5196,13 +5195,13 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
> return NULL;
>
> tbl = &session->fc_slot_table;
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> spin_lock_init(&tbl->slot_tbl_lock);
> rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
> init_completion(&tbl->complete);
>
> tbl = &session->bc_slot_table;
> - tbl->highest_used_slotid = -1;
> + tbl->highest_used_slotid = NFS4_NO_SLOT;
> spin_lock_init(&tbl->slot_tbl_lock);
> rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
> init_completion(&tbl->complete);
> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
> index 9e101c1..2ae57f2 100644
> --- a/include/linux/nfs_fs_sb.h
> +++ b/include/linux/nfs_fs_sb.h
> @@ -191,6 +191,7 @@ struct nfs_server {
>
> /* maximum number of slots to use */
> #define NFS4_MAX_SLOT_TABLE (128U)
> +#define NFS4_NO_SLOT ((u32)-1)
>
> #if defined(CONFIG_NFS_V4)
>
> @@ -201,10 +202,10 @@ struct nfs4_slot_table {
> unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
> spinlock_t slot_tbl_lock;
> struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
> - int max_slots; /* # slots in table */
> - int highest_used_slotid; /* sent to server on each SEQ.
> + u32 max_slots; /* # slots in table */
> + u32 highest_used_slotid; /* sent to server on each SEQ.
> * op for dynamic resizing */
> - int target_max_slots; /* Set by CB_RECALL_SLOT as
> + u32 target_max_slots; /* Set by CB_RECALL_SLOT as
> * the new max_slots */
> struct completion complete;
> };
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 144419a..adbc84a 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -181,7 +181,7 @@ struct nfs4_slot {
>
> struct nfs4_sequence_args {
> struct nfs4_session *sa_session;
> - u8 sa_slotid;
> + u32 sa_slotid;
> u8 sa_cache_this;
> };
>
> --
> 1.7.7.6
>

2012-02-08 17:49:07

by Jim Rees

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

Myklebust, Trond wrote:

10GigE + high latencies is exactly where we're seeing the value. Andy
has been working with the high energy physics community doing NFS
traffic between the US and CERN...

CITI to CERN is just over 120ms. I don't know what it would be from Andy's
house. Does he have 10G at home yet?

2012-02-08 20:31:45

by Jim Rees

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

J. Bruce Fields wrote:

On Wed, Feb 08, 2012 at 12:49:01PM -0500, Jim Rees wrote:
> Myklebust, Trond wrote:
>
> 10GigE + high latencies is exactly where we're seeing the value. Andy
> has been working with the high energy physics community doing NFS
> traffic between the US and CERN...
>
> CITI to CERN is just over 120ms. I don't know what it would be from Andy's
> house. Does he have 10G at home yet?

That still seems short of what you'd need to get a 255MB bandwidth-delay
product.

I'm just curious what the experiment is here and whether there's a
possibility the real problem is elsewhere.

In my opinion, any fix that involves allocating multiple parallel data
streams (rpc slots, tcp connections) is masking the real problem. But it's
an effective fix.

2012-02-08 18:31:54

by J. Bruce Fields

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

On Wed, Feb 08, 2012 at 12:49:01PM -0500, Jim Rees wrote:
> Myklebust, Trond wrote:
>
> 10GigE + high latencies is exactly where we're seeing the value. Andy
> has been working with the high energy physics community doing NFS
> traffic between the US and CERN...
>
> CITI to CERN is just over 120ms. I don't know what it would be from Andy's
> house. Does he have 10G at home yet?

That still seems short of what you'd need to get a 255MB bandwidth-delay
product.

I'm just curious what the experiment is here and whether there's a
possibility the real problem is elsewhere.

--b.

2012-02-08 21:01:57

by Jim Rees

Subject: Re: [RFC PATCH 1/2] NFSv4.1: Convert slotid from u8 to u32

Myklebust, Trond wrote:

On Wed, 2012-02-08 at 15:31 -0500, Jim Rees wrote:
> J. Bruce Fields wrote:
>
> On Wed, Feb 08, 2012 at 12:49:01PM -0500, Jim Rees wrote:
> > Myklebust, Trond wrote:
> >
> > 10GigE + high latencies is exactly where we're seeing the value. Andy
> > has been working with the high energy physics community doing NFS
> > traffic between the US and CERN...
> >
> > CITI to CERN is just over 120ms. I don't know what it would be from Andy's
> > house. Does he have 10G at home yet?
>
> That still seems short of what you'd need to get a 255MB bandwidth-delay
> product.
>
> I'm just curious what the experiment is here and whether there's a
> possibility the real problem is elsewhere.
>
> In my opinion, any fix that involves allocating multiple parallel data
> streams (rpc slots, tcp connections) is masking the real problem. But it's
> an effective fix.

Who said anything about multiple tcp connections? All the slots do is
allow the server to process more RPC calls in parallel by feeding it
more work. How is that masking a problem?

Sorry, the comma was intended to be "or". I realize there is just one tcp
connection.