Hey-
Here are four patches that address recent issues discovered in 2.6.38.
They should be ready for inclusion upstream. I say that because that
guarantees you will find bugs in them.
---
Chuck Lever (4):
NFS: Prevent memory allocation failure in nfsacl_encode()
NFS: nfsacl_{encode,decode} should return signed integer
NFS: Fix "kernel BUG at fs/nfs/nfs3xdr.c:1338!"
NFS: Fix "kernel BUG at fs/aio.c:554!"
fs/nfs/direct.c | 30 +++++++++++++------------
fs/nfs/nfs3acl.c | 4 ++-
fs/nfs/nfs3xdr.c | 5 +++-
fs/nfs_common/nfsacl.c | 54 ++++++++++++++++++++++++++++++++++-----------
fs/posix_acl.c | 17 +++++++++++---
include/linux/nfsacl.h | 4 ++-
include/linux/posix_acl.h | 1 +
7 files changed, 79 insertions(+), 36 deletions(-)
--
Chuck Lever
Clean up.
The nfsacl_encode() and nfsacl_decode() functions return negative
errno values, and each call site verifies that the returned value
is not negative. Change the synopsis of both of these functions
to reflect this usage.
Document the synopsis and return values.
Reported-by: Trond Myklebust <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs_common/nfsacl.c | 32 ++++++++++++++++++++++++++------
include/linux/nfsacl.h | 4 ++--
2 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c525..a3e78bd 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -72,9 +72,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
return 0;
}
-unsigned int
-nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
- struct posix_acl *acl, int encode_entries, int typeflag)
+/**
+ * nfsacl_encode - Encode an NFSv3 ACL
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+ struct posix_acl *acl, int encode_entries, int typeflag)
{
int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
struct nfsacl_encode_desc nfsacl_desc = {
@@ -224,9 +235,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
return 0;
}
-unsigned int
-nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
- struct posix_acl **pacl)
+/**
+ * nfsacl_decode - Decode an NFSv3 ACL
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+ struct posix_acl **pacl)
{
struct nfsacl_decode_desc nfsacl_desc = {
.desc = {
diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h
index f321b57..fabcb1e 100644
--- a/include/linux/nfsacl.h
+++ b/include/linux/nfsacl.h
@@ -51,10 +51,10 @@ nfsacl_size(struct posix_acl *acl_access, struct posix_acl *acl_default)
return w;
}
-extern unsigned int
+extern int
nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
struct posix_acl *acl, int encode_entries, int typeflag);
-extern unsigned int
+extern int
nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
struct posix_acl **pacl);
nfsacl_encode() allocates memory in certain cases. This of course
is not guaranteed to work.
Since commit 9f06c719 "SUNRPC: New xdr_streams XDR encoder API", the
kernel's XDR encoders can't return a result indicating a possible
failure, so a memory allocation failure in nfsacl_encode() has become
fatal (i.e., the XDR code Oopses) in some cases.
However, the allocated memory is a tiny fixed amount, on the order
of 40-50 bytes. We can easily use a stack-allocated buffer for
this, with only a wee bit of nose-holding.
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs/nfs3acl.c | 4 ++--
fs/nfs_common/nfsacl.c | 22 +++++++++++++++-------
fs/posix_acl.c | 17 +++++++++++++----
include/linux/posix_acl.h | 1 +
4 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f..2743427 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
goto out;
- /* We are doing this here, because XDR marshalling can only
- return -ENOMEM. */
+ /* We are doing this here because XDR marshalling does not
+ * return any results, it BUGs. */
status = -ENOSPC;
if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
goto out;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index a3e78bd..84c27d6 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
gid_t gid;
};
+struct nfsacl_simple_acl {
+ struct posix_acl acl;
+ struct posix_acl_entry ace[4];
+};
+
static int
xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
{
@@ -99,17 +104,22 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
.uid = inode->i_uid,
.gid = inode->i_gid,
};
+ struct nfsacl_simple_acl aclbuf;
int err;
- struct posix_acl *acl2 = NULL;
if (entries > NFS_ACL_MAX_ENTRIES ||
xdr_encode_word(buf, base, entries))
return -EINVAL;
if (encode_entries && acl && acl->a_count == 3) {
- /* Fake up an ACL_MASK entry. */
- acl2 = posix_acl_alloc(4, GFP_KERNEL);
- if (!acl2)
- return -ENOMEM;
+ struct posix_acl *acl2 = &aclbuf.acl;
+
+ /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
+ * invoked in contexts where a memory allocation failure is
+ * fatal. Fortunately this fake ACL is small enough to
+ * construct on the stack. */
+ memset(acl2, 0, sizeof(acl2));
+ posix_acl_init(acl2, 4);
+
/* Insert entries in canonical order: other orders seem
to confuse Solaris VxFS. */
acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
@@ -120,8 +130,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
nfsacl_desc.acl = acl2;
}
err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
- if (acl2)
- posix_acl_release(acl2);
if (!err)
err = 8 + nfsacl_desc.desc.elem_size *
nfsacl_desc.desc.array_len;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a..b1cf6bf 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
#include <linux/errno.h>
+EXPORT_SYMBOL(posix_acl_init);
EXPORT_SYMBOL(posix_acl_alloc);
EXPORT_SYMBOL(posix_acl_clone);
EXPORT_SYMBOL(posix_acl_valid);
@@ -32,6 +33,16 @@ EXPORT_SYMBOL(posix_acl_chmod_masq);
EXPORT_SYMBOL(posix_acl_permission);
/*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+ atomic_set(&acl->a_refcount, 1);
+ acl->a_count = count;
+}
+
+/*
* Allocate a new ACL with the specified number of entries.
*/
struct posix_acl *
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
const size_t size = sizeof(struct posix_acl) +
count * sizeof(struct posix_acl_entry);
struct posix_acl *acl = kmalloc(size, flags);
- if (acl) {
- atomic_set(&acl->a_refcount, 1);
- acl->a_count = count;
- }
+ if (acl)
+ posix_acl_init(acl, count);
return acl;
}
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index d68283a..54211c1 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -71,6 +71,7 @@ posix_acl_release(struct posix_acl *acl)
/* posix_acl.c */
+extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
extern struct posix_acl *posix_acl_clone(const struct posix_acl *, gfp_t);
extern int posix_acl_valid(const struct posix_acl *);
On 11-01-20 22:05, Chuck Lever wrote:
> Nick Piggin reports:
>
> > I'm getting use after frees in aio code in NFS
> >
> > [ 2703.396766] Call Trace:
> > [ 2703.396858] [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
> > [ 2703.396959] [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
> > [ 2703.397058] [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
> > [ 2703.397159] [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
> > [ 2703.397260] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> > [ 2703.397361] [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
> > [ 2703.397464] [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
> > [ 2703.397564] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> > [ 2703.397662] [<ffffffff811627db>] aio_put_req+0x2b/0x60
> > [ 2703.397761] [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
> > [ 2703.397895] [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
> > [ 2703.397995] [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
> >
> > Adding some tracing, it is due to nfs completing the request then
> > returning something other than -EIOCBQUEUED, so aio.c
> > also completes the request.
>
> To address this, prevent the NFS direct I/O engine from completing
> async iocbs when the forward path returns an error without starting
> any I/O.
>
> This fix appears to survive ^C during both "xfstest no. 208" and "fsx
> -Z."
>
> It's likely this bug has existed for a very long while, as we are seeing
> very similar symptoms in OEL 5. Copying stable.
>
> Cc: Stable <[email protected]>
> Signed-off-by: Chuck Lever <[email protected]>
> ---
>
> fs/nfs/direct.c | 30 ++++++++++++++++--------------
> 1 files changed, 16 insertions(+), 14 deletions(-)
>
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index e6ace0d..bde25ca 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -407,15 +407,16 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
> pos += vec->iov_len;
> }
>
> + /*
> + * If no bytes were started, return the error, and let the
> + * generic layer handle the completion.
> + */
> + if (requested_bytes == 0)
> + return result < 0 ? result : -EIO;
> +
> if (put_dreq(dreq))
> nfs_direct_complete(dreq);
Same comment as I wrote in another thread:
put_dreq() -> nfs_direct_complete() does more than complete the aio itself.
It also drops the ref on dreq with put_dreq() and does
complete_all(&dreq->completion);
nfs_direct_req_release(dreq);
I think we still need that called somewhere.
regards,
wengang.
> -
> - if (requested_bytes != 0)
> - return 0;
> -
> - if (result < 0)
> - return result;
> - return -EIO;
> + return 0;
> }
>
> static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
> @@ -841,15 +842,16 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
> pos += vec->iov_len;
> }
>
> + /*
> + * If no bytes were started, return the error, and let the
> + * generic layer handle the completion.
> + */
> + if (requested_bytes == 0)
> + return result < 0 ? result : -EIO;
> +
> if (put_dreq(dreq))
> nfs_direct_write_complete(dreq, dreq->inode);
> -
> - if (requested_bytes != 0)
> - return 0;
> -
> - if (result < 0)
> - return result;
> - return -EIO;
> + return 0;
> }
>
> static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Nick Piggin reports:
> I'm getting use after frees in aio code in NFS
>
> [ 2703.396766] Call Trace:
> [ 2703.396858] [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
> [ 2703.396959] [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
> [ 2703.397058] [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
> [ 2703.397159] [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
> [ 2703.397260] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397361] [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
> [ 2703.397464] [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
> [ 2703.397564] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397662] [<ffffffff811627db>] aio_put_req+0x2b/0x60
> [ 2703.397761] [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
> [ 2703.397895] [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
> [ 2703.397995] [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
>
> Adding some tracing, it is due to nfs completing the request then
> returning something other than -EIOCBQUEUED, so aio.c
> also completes the request.
To address this, prevent the NFS direct I/O engine from completing
async iocbs when the forward path returns an error without starting
any I/O.
This fix appears to survive ^C during both "xfstest no. 208" and "fsx
-Z."
It's likely this bug has existed for a very long while, as we are seeing
very similar symptoms in OEL 5. Copying stable.
Cc: Stable <[email protected]>
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs/direct.c | 30 ++++++++++++++++--------------
1 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d..bde25ca 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,16 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
pos += vec->iov_len;
}
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0)
+ return result < 0 ? result : -EIO;
+
if (put_dreq(dreq))
nfs_direct_complete(dreq);
-
- if (requested_bytes != 0)
- return 0;
-
- if (result < 0)
- return result;
- return -EIO;
+ return 0;
}
static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +842,16 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
pos += vec->iov_len;
}
+ /*
+ * If no bytes were started, return the error, and let the
+ * generic layer handle the completion.
+ */
+ if (requested_bytes == 0)
+ return result < 0 ? result : -EIO;
+
if (put_dreq(dreq))
nfs_direct_write_complete(dreq, dreq->inode);
-
- if (requested_bytes != 0)
- return 0;
-
- if (result < 0)
- return result;
- return -EIO;
+ return 0;
}
static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
Milan Broz <[email protected]> reports:
> on today Linus' tree I get OOps if using nfs.
>
> server (2.6.36) exports dir:
> /dir 172.16.1.0/24(rw,async,all_squash,no_subtree_check,anonuid=500,anongid=500)
>
> on client it is mounted in fstab
> server:/dir /mnt/tst nfs rw,soft 0 0
>
> and these commands OOpses it (simplified from a configure script):
>
> cd /dir
> touch x
> install x y
>
> [ 105.327701] ------------[ cut here ]------------
> [ 105.327979] kernel BUG at fs/nfs/nfs3xdr.c:1338!
> [ 105.328075] invalid opcode: 0000 [#1] PREEMPT SMP
> [ 105.328223] last sysfs file: /sys/devices/virtual/bdi/0:16/uevent
> [ 105.328349] Modules linked in: usbcore dm_mod
> [ 105.328553]
> [ 105.328678] Pid: 3710, comm: install Not tainted 2.6.37+ #423 440BX Desktop Reference Platform/VMware Virtual Platform
> [ 105.328853] EIP: 0060:[<c116c06c>] EFLAGS: 00010282 CPU: 0
> [ 105.329152] EIP is at nfs3_xdr_enc_setacl3args+0x61/0x98
> [ 105.329249] EAX: ffffffea EBX: ce941d98 ECX: 00000000 EDX: 00000004
> [ 105.329340] ESI: ce941cd0 EDI: 000000a4 EBP: ce941cc0 ESP: ce941cb4
> [ 105.329431] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> [ 105.329525] Process install (pid: 3710, ti=ce940000 task=ced36f20 task.ti=ce940000)
> [ 105.336600] Stack:
> [ 105.336693] ce941cd0 ce9dc000 00000000 ce941cf8 c12ecd02 c12f43e0 c116c00b cf754158
> [ 105.336982] ce9dc004 cf754284 ce9dc004 cf7ffee8 ceff9978 ce9dc000 cf7ffee8 ce9dc000
> [ 105.337182] ce9dc000 ce941d14 c12e698d cf75412c ce941d98 cf7ffee8 cf7fff20 00000000
> [ 105.337405] Call Trace:
> [ 105.337695] [<c12ecd02>] rpcauth_wrap_req+0x75/0x7f
> [ 105.337806] [<c12f43e0>] ? xdr_encode_opaque+0x12/0x15
> [ 105.337898] [<c116c00b>] ? nfs3_xdr_enc_setacl3args+0x0/0x98
> [ 105.337988] [<c12e698d>] call_transmit+0x17e/0x1e8
> [ 105.338072] [<c12ec307>] __rpc_execute+0x6d/0x1a6
> [ 105.338155] [<c12ec474>] rpc_execute+0x34/0x37
> [ 105.338235] [<c12e738d>] rpc_run_task+0xb5/0xbd
> [ 105.338316] [<c12e7474>] rpc_call_sync+0x3d/0x58
> [ 105.338402] [<c116d0c6>] nfs3_proc_setacls+0x18e/0x24f
> [ 105.338493] [<c10b3f76>] ? __kmalloc+0x148/0x1c4
> [ 105.338579] [<c10ecd01>] ? posix_acl_alloc+0x12/0x22
> [ 105.338665] [<c116d5c8>] nfs3_proc_setacl+0xa0/0xca
> [ 105.338748] [<c116d69c>] nfs3_setxattr+0x62/0x88
> [ 105.338834] [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [ 105.338926] [<c116d63a>] ? nfs3_setxattr+0x0/0x88
> [ 105.339026] [<c10cfa79>] __vfs_setxattr_noperm+0x26/0x95
> [ 105.339114] [<c10cfb43>] vfs_setxattr+0x5b/0x76
> [ 105.339211] [<c10cfbfb>] setxattr+0x9d/0xc3
> [ 105.339298] [<c10a2ea8>] ? handle_pte_fault+0x258/0x5cb
> [ 105.339428] [<c1091ff6>] ? __free_pages+0x1a/0x23
> [ 105.339517] [<c10498ea>] ? up_read+0x16/0x2c
> [ 105.339599] [<c10b8365>] ? fget+0x0/0xa3
> [ 105.339677] [<c10b8365>] ? fget+0x0/0xa3
> [ 105.339760] [<c1025d23>] ? get_parent_ip+0xb/0x31
> [ 105.339843] [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [ 105.339931] [<c10cfc72>] sys_fsetxattr+0x51/0x79
> [ 105.340014] [<c1002853>] sysenter_do_call+0x12/0x32
> [ 105.340133] Code: 2e 76 18 00 58 31 d2 8b 7f 28 f6 43 04 01 74 03 8b 53 08 6a 00 8b 46 04 6a 01 8b 0b 52 89 fa e8 85 10 f8 ff 83 c4 0c 85 c0 79 04 <0f> 0b eb fe 31 c9 f6 43 04 04 74 03 8b 4b 0c 68 00 10 00 00 8d
> [ 105.350321] EIP: [<c116c06c>] nfs3_xdr_enc_setacl3args+0x61/0x98 SS:ESP 0068:ce941cb4
> [ 105.364385] ---[ end trace 01fcfe7f0f7f6e4a ]---
nfs3_xdr_enc_setacl3args() is not properly setting up the target
buffer before nfsacl_encode() attempts to encode the ACL.
Introduced by commit d9c407b1 "NFS: Introduce new-style XDR encoding
functions for NFSv3."
Signed-off-by: Chuck Lever <[email protected]>
---
fs/nfs/nfs3xdr.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b..183c6b1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
encode_nfs_fh3(xdr, NFS_FH(args->inode));
encode_uint32(xdr, args->mask);
+
+ base = req->rq_slen;
if (args->npages != 0)
xdr_write_pages(xdr, args->pages, 0, args->len);
+ else
+ xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
- base = req->rq_slen;
error = nfsacl_encode(xdr->buf, base, args->inode,
(args->mask & NFS_ACL) ?
args->acl_access : NULL, 1, 0);