2022-11-14 05:32:05

by Xiubo Li

[permalink] [raw]
Subject: [PATCH 0/2 v2] ceph: fix the use-after-free bug for file_lock

From: Xiubo Li <[email protected]>

Changed in V2:
- switch to file_lock.fl_u to fix the race bug
- and the most code will be in the ceph layer

Xiubo Li (2):
ceph: add ceph_lock_info support for file_lock
ceph: use a xarray to record all the opened files for each inode

fs/ceph/file.c | 9 +++++++++
fs/ceph/inode.c | 4 ++++
fs/ceph/locks.c | 35 +++++++++++++++++++++++++++++----
fs/ceph/super.h | 4 ++++
include/linux/ceph/ceph_fs_fl.h | 26 ++++++++++++++++++++++++
include/linux/fs.h | 2 ++
6 files changed, 76 insertions(+), 4 deletions(-)
create mode 100644 include/linux/ceph/ceph_fs_fl.h

--
2.31.1



2022-11-14 05:41:45

by Xiubo Li

[permalink] [raw]
Subject: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode

From: Xiubo Li <[email protected]>

When releasing the file locks the fl->fl_file memory could be
already released by another thread in filp_close(), so we couldn't
depend on fl->fl_file to get the inode. Just use a xarray to record
the opened files for each inode.

Cc: [email protected]
URL: https://tracker.ceph.com/issues/57986
Signed-off-by: Xiubo Li <[email protected]>
---
fs/ceph/file.c | 9 +++++++++
fs/ceph/inode.c | 4 ++++
fs/ceph/locks.c | 17 ++++++++++++++++-
fs/ceph/super.h | 4 ++++
4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 85afcbbb5648..cb4a9c52df27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -231,6 +231,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
fi->flags |= CEPH_F_SYNC;

file->private_data = fi;
+
+ ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
+ CEPH_FILP_AVAILABLE, GFP_KERNEL);
+ if (ret) {
+ kmem_cache_free(ceph_file_cachep, fi);
+ return ret;
+ }
}

ceph_get_fmode(ci, fmode, 1);
@@ -932,6 +939,8 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));

+ xa_erase(&ci->i_opened_files, (unsigned long)file);
+
ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
ceph_put_fmode(ci, fi->fmode, 1);

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 77b0cd9af370..554450838e44 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -619,6 +619,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);

+ xa_init(&ci->i_opened_files);
+
ci->i_snap_realm = NULL;
INIT_LIST_HEAD(&ci->i_snap_realm_item);
INIT_LIST_HEAD(&ci->i_snap_flush_item);
@@ -637,6 +639,8 @@ void ceph_free_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);

+ xa_destroy(&ci->i_opened_files);
+
kfree(ci->i_symlink);
#ifdef CONFIG_FS_ENCRYPTION
kfree(ci->fscrypt_auth);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d8385dd0076e..a176a30badd0 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -42,9 +42,10 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)

static void ceph_fl_release_lock(struct file_lock *fl)
{
- struct ceph_file_info *fi = fl->fl_file->private_data;
struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
struct ceph_inode_info *ci;
+ struct ceph_file_info *fi;
+ void *val;

/*
* If inode is NULL it should be a request file_lock,
@@ -54,6 +55,20 @@ static void ceph_fl_release_lock(struct file_lock *fl)
return;

ci = ceph_inode(inode);
+
+ /*
+ * For Posix-style locks, it may race between filp_close()s,
+ * and it's possible that the 'file' memory pointed by
+ * 'fl->fl_file' has been released. If so just skip it.
+ */
+ rcu_read_lock();
+ val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
+ if (val == CEPH_FILP_AVAILABLE) {
+ fi = fl->fl_file->private_data;
+ atomic_dec(&fi->num_locks);
+ }
+ rcu_read_unlock();
+
if (atomic_dec_and_test(&ci->i_filelock_ref)) {
/* clear error when all locks are released */
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b75a84ba48d..b3e89192cbec 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -329,6 +329,8 @@ struct ceph_inode_xattrs_info {
u64 version, index_version;
};

+#define CEPH_FILP_AVAILABLE xa_mk_value(1)
+
/*
* Ceph inode.
*/
@@ -434,6 +436,8 @@ struct ceph_inode_info {
struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;

+ struct xarray i_opened_files;
+
union {
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
--
2.31.1


2022-11-14 05:46:00

by Xiubo Li

[permalink] [raw]
Subject: [PATCH 1/2 v2] ceph: add ceph_lock_info support for file_lock

From: Xiubo Li <[email protected]>

When ceph releasing the file_lock it will try to get the inode pointer
from the fl->fl_file, which the memory could already be released by
another thread in filp_close(). Because in VFS layer the fl->fl_file
doesn't increase the file's reference counter.

Will switch to use ceph dedicate lock info to track the inode.

And in ceph_fl_release_lock() we should skip all the operations if
the fl->fl_u.ceph_fl.fl_inode is not set, which should come from
the request file_lock. And we will set fl->fl_u.ceph_fl.fl_inode when
inserting it to the inode lock list, which is when copying the lock.

Cc: [email protected]
URL: https://tracker.ceph.com/issues/57986
Signed-off-by: Xiubo Li <[email protected]>
---
fs/ceph/locks.c | 18 +++++++++++++++---
include/linux/ceph/ceph_fs_fl.h | 26 ++++++++++++++++++++++++++
include/linux/fs.h | 2 ++
3 files changed, 43 insertions(+), 3 deletions(-)
create mode 100644 include/linux/ceph/ceph_fs_fl.h

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 3e2843e86e27..d8385dd0076e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -34,22 +34,34 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
struct ceph_file_info *fi = dst->fl_file->private_data;
struct inode *inode = file_inode(dst->fl_file);
+
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
atomic_inc(&fi->num_locks);
+ dst->fl_u.ceph_fl.fl_inode = igrab(inode);
}

static void ceph_fl_release_lock(struct file_lock *fl)
{
struct ceph_file_info *fi = fl->fl_file->private_data;
- struct inode *inode = file_inode(fl->fl_file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- atomic_dec(&fi->num_locks);
+ struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
+ struct ceph_inode_info *ci;
+
+ /*
+ * If inode is NULL it should be a request file_lock,
+ * nothing we can do.
+ */
+ if (!inode)
+ return;
+
+ ci = ceph_inode(inode);
if (atomic_dec_and_test(&ci->i_filelock_ref)) {
/* clear error when all locks are released */
spin_lock(&ci->i_ceph_lock);
ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
spin_unlock(&ci->i_ceph_lock);
}
+ iput(inode);
+ atomic_dec(&fi->num_locks);
}

static const struct file_lock_operations ceph_fl_lock_ops = {
diff --git a/include/linux/ceph/ceph_fs_fl.h b/include/linux/ceph/ceph_fs_fl.h
new file mode 100644
index 000000000000..02a412b26095
--- /dev/null
+++ b/include/linux/ceph/ceph_fs_fl.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef CEPH_FS_FL_H
+#define CEPH_FS_FL_H
+
+#include <linux/fs.h>
+
+/*
+ * Ceph lock info
+ */
+
+struct ceph_lock_info {
+ struct inode *fl_inode;
+};
+
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e654435f1651..db4810d19e26 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1066,6 +1066,7 @@ bool opens_in_grace(struct net *);

/* that will die - we need it for nfs_lock_info */
#include <linux/nfs_fs_i.h>
+#include <linux/ceph/ceph_fs_fl.h>

/*
* struct file_lock represents a generic "file lock". It's used to represent
@@ -1119,6 +1120,7 @@ struct file_lock {
int state; /* state of grant or error if -ve */
unsigned int debug_id;
} afs;
+ struct ceph_lock_info ceph_fl;
} fl_u;
} __randomize_layout;

--
2.31.1


2022-11-14 09:44:14

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode

Hi,

I love your patch! Perhaps something to improve:

[auto build test WARNING on ceph-client/testing]
[also build test WARNING on ceph-client/for-linus linus/master v6.1-rc5 next-20221111]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
base: https://github.com/ceph/ceph-client.git testing
patch link: https://lore.kernel.org/r/20221114051901.15371-3-xiubli%40redhat.com
patch subject: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode
config: hexagon-randconfig-r041-20221114
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/232cc8f1dbeddb308946202a7c67ee4d20451ae7
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
git checkout 232cc8f1dbeddb308946202a7c67ee4d20451ae7
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash fs/ceph/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

In file included from fs/ceph/locks.c:8:
In file included from fs/ceph/super.h:8:
In file included from include/linux/backing-dev.h:16:
In file included from include/linux/writeback.h:13:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __raw_readb(PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
#define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
^
In file included from fs/ceph/locks.c:8:
In file included from fs/ceph/super.h:8:
In file included from include/linux/backing-dev.h:16:
In file included from include/linux/writeback.h:13:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
#define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
^
In file included from fs/ceph/locks.c:8:
In file included from fs/ceph/super.h:8:
In file included from include/linux/backing-dev.h:16:
In file included from include/linux/writeback.h:13:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:334:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writeb(value, PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
__raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
~~~~~~~~~~ ^
>> fs/ceph/locks.c:66:6: warning: variable 'fi' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
if (val == CEPH_FILP_AVAILABLE) {
^~~~~~~~~~~~~~~~~~~~~~~~~~
fs/ceph/locks.c:79:14: note: uninitialized use occurs here
atomic_dec(&fi->num_locks);
^~
fs/ceph/locks.c:66:2: note: remove the 'if' if its condition is always true
if (val == CEPH_FILP_AVAILABLE) {
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fs/ceph/locks.c:47:27: note: initialize the variable 'fi' to silence this warning
struct ceph_file_info *fi;
^
= NULL
7 warnings generated.


vim +66 fs/ceph/locks.c

42
43 static void ceph_fl_release_lock(struct file_lock *fl)
44 {
45 struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
46 struct ceph_inode_info *ci;
47 struct ceph_file_info *fi;
48 void *val;
49
50 /*
51 * If inode is NULL it should be a request file_lock,
52 * nothing we can do.
53 */
54 if (!inode)
55 return;
56
57 ci = ceph_inode(inode);
58
59 /*
60 * For Posix-style locks, it may race between filp_close()s,
61 * and it's possible that the 'file' memory pointed by
62 * 'fl->fl_file' has been released. If so just skip it.
63 */
64 rcu_read_lock();
65 val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
> 66 if (val == CEPH_FILP_AVAILABLE) {
67 fi = fl->fl_file->private_data;
68 atomic_dec(&fi->num_locks);
69 }
70 rcu_read_unlock();
71
72 if (atomic_dec_and_test(&ci->i_filelock_ref)) {
73 /* clear error when all locks are released */
74 spin_lock(&ci->i_ceph_lock);
75 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
76 spin_unlock(&ci->i_ceph_lock);
77 }
78 iput(inode);
79 atomic_dec(&fi->num_locks);
80 }
81

--
0-DAY CI Kernel Test Service
https://01.org/lkp


Attachments:
(No filename) (8.01 kB)
config (119.61 kB)
Download all attachments

2022-11-14 11:45:45

by Jeffrey Layton

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] ceph: add ceph_lock_info support for file_lock

On Mon, 2022-11-14 at 13:19 +0800, [email protected] wrote:
> From: Xiubo Li <[email protected]>
>
> When ceph releasing the file_lock it will try to get the inode pointer
> from the fl->fl_file, which the memory could already be released by
> another thread in filp_close(). Because in VFS layer the fl->fl_file
> doesn't increase the file's reference counter.
>
> Will switch to use ceph dedicate lock info to track the inode.
>
> And in ceph_fl_release_lock() we should skip all the operations if
> the fl->fl_u.ceph_fl.fl_inode is not set, which should come from
> the request file_lock. And we will set fl->fl_u.ceph_fl.fl_inode when
> inserting it to the inode lock list, which is when copying the lock.
>
> Cc: [email protected]
> URL: https://tracker.ceph.com/issues/57986
> Signed-off-by: Xiubo Li <[email protected]>
> ---
> fs/ceph/locks.c | 18 +++++++++++++++---
> include/linux/ceph/ceph_fs_fl.h | 26 ++++++++++++++++++++++++++
> include/linux/fs.h | 2 ++
> 3 files changed, 43 insertions(+), 3 deletions(-)
> create mode 100644 include/linux/ceph/ceph_fs_fl.h
>
> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
> index 3e2843e86e27..d8385dd0076e 100644
> --- a/fs/ceph/locks.c
> +++ b/fs/ceph/locks.c
> @@ -34,22 +34,34 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
> {
> struct ceph_file_info *fi = dst->fl_file->private_data;
> struct inode *inode = file_inode(dst->fl_file);
> +
> atomic_inc(&ceph_inode(inode)->i_filelock_ref);
> atomic_inc(&fi->num_locks);
> + dst->fl_u.ceph_fl.fl_inode = igrab(inode);
> }
>
> static void ceph_fl_release_lock(struct file_lock *fl)
> {
> struct ceph_file_info *fi = fl->fl_file->private_data;
> - struct inode *inode = file_inode(fl->fl_file);
> - struct ceph_inode_info *ci = ceph_inode(inode);
> - atomic_dec(&fi->num_locks);
> + struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
> + struct ceph_inode_info *ci;
> +
> + /*
> + * If inode is NULL it should be a request file_lock,
> + * nothing we can do.
> + */
> + if (!inode)
> + return;
> +
> + ci = ceph_inode(inode);
> if (atomic_dec_and_test(&ci->i_filelock_ref)) {
> /* clear error when all locks are released */
> spin_lock(&ci->i_ceph_lock);
> ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
> spin_unlock(&ci->i_ceph_lock);
> }
> + iput(inode);
> + atomic_dec(&fi->num_locks);

It doesn't look like this fixes the original issue. "fi" may be pointing
to freed memory at this point, right? I think you may need to get rid of
the "num_locks" field in ceph_file_info, and do that in a different way?

> }
>
> static const struct file_lock_operations ceph_fl_lock_ops = {
> diff --git a/include/linux/ceph/ceph_fs_fl.h b/include/linux/ceph/ceph_fs_fl.h
> new file mode 100644
> index 000000000000..02a412b26095
> --- /dev/null
> +++ b/include/linux/ceph/ceph_fs_fl.h
> @@ -0,0 +1,26 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * ceph_fs.h - Ceph constants and data types to share between kernel and
> + * user space.
> + *
> + * Most types in this file are defined as little-endian, and are
> + * primarily intended to describe data structures that pass over the
> + * wire or that are stored on disk.
> + *
> + * LGPL2
> + */
> +
> +#ifndef CEPH_FS_FL_H
> +#define CEPH_FS_FL_H
> +
> +#include <linux/fs.h>
> +
> +/*
> + * Ceph lock info
> + */
> +
> +struct ceph_lock_info {
> + struct inode *fl_inode;
> +};
> +
> +#endif
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e654435f1651..db4810d19e26 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1066,6 +1066,7 @@ bool opens_in_grace(struct net *);
>
> /* that will die - we need it for nfs_lock_info */
> #include <linux/nfs_fs_i.h>
> +#include <linux/ceph/ceph_fs_fl.h>
>
> /*
> * struct file_lock represents a generic "file lock". It's used to represent
> @@ -1119,6 +1120,7 @@ struct file_lock {
> int state; /* state of grant or error if -ve */
> unsigned int debug_id;
> } afs;
> + struct ceph_lock_info ceph_fl;
> } fl_u;
> } __randomize_layout;
>

--
Jeff Layton <[email protected]>

2022-11-14 13:27:32

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH 1/2 v2] ceph: add ceph_lock_info support for file_lock


On 14/11/2022 19:24, Jeff Layton wrote:
> On Mon, 2022-11-14 at 13:19 +0800, [email protected] wrote:
>> From: Xiubo Li <[email protected]>
>>
>> When ceph releasing the file_lock it will try to get the inode pointer
>> from the fl->fl_file, which the memory could already be released by
>> another thread in filp_close(). Because in VFS layer the fl->fl_file
>> doesn't increase the file's reference counter.
>>
>> Will switch to use ceph dedicate lock info to track the inode.
>>
>> And in ceph_fl_release_lock() we should skip all the operations if
>> the fl->fl_u.ceph_fl.fl_inode is not set, which should come from
>> the request file_lock. And we will set fl->fl_u.ceph_fl.fl_inode when
>> inserting it to the inode lock list, which is when copying the lock.
>>
>> Cc: [email protected]
>> URL: https://tracker.ceph.com/issues/57986
>> Signed-off-by: Xiubo Li <[email protected]>
>> ---
>> fs/ceph/locks.c | 18 +++++++++++++++---
>> include/linux/ceph/ceph_fs_fl.h | 26 ++++++++++++++++++++++++++
>> include/linux/fs.h | 2 ++
>> 3 files changed, 43 insertions(+), 3 deletions(-)
>> create mode 100644 include/linux/ceph/ceph_fs_fl.h
>>
>> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
>> index 3e2843e86e27..d8385dd0076e 100644
>> --- a/fs/ceph/locks.c
>> +++ b/fs/ceph/locks.c
>> @@ -34,22 +34,34 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
>> {
>> struct ceph_file_info *fi = dst->fl_file->private_data;
>> struct inode *inode = file_inode(dst->fl_file);
>> +
>> atomic_inc(&ceph_inode(inode)->i_filelock_ref);
>> atomic_inc(&fi->num_locks);
>> + dst->fl_u.ceph_fl.fl_inode = igrab(inode);
>> }
>>
>> static void ceph_fl_release_lock(struct file_lock *fl)
>> {
>> struct ceph_file_info *fi = fl->fl_file->private_data;
>> - struct inode *inode = file_inode(fl->fl_file);
>> - struct ceph_inode_info *ci = ceph_inode(inode);
>> - atomic_dec(&fi->num_locks);
>> + struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
>> + struct ceph_inode_info *ci;
>> +
>> + /*
>> + * If inode is NULL it should be a request file_lock,
>> + * nothing we can do.
>> + */
>> + if (!inode)
>> + return;
>> +
>> + ci = ceph_inode(inode);
>> if (atomic_dec_and_test(&ci->i_filelock_ref)) {
>> /* clear error when all locks are released */
>> spin_lock(&ci->i_ceph_lock);
>> ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
>> spin_unlock(&ci->i_ceph_lock);
>> }
>> + iput(inode);
>> + atomic_dec(&fi->num_locks);
> It doesn't look like this fixes the original issue. "fi" may be pointing
> to freed memory at this point, right?

Yeah, I didn't fix this in the this patch. I fixed it in a dedicated 2/2
patch.

> I think you may need to get rid of
> the "num_locks" field in ceph_file_info, and do that in a different way?
>
This is a dedicated field for each 'file' struct. I have no idea how to
fix this in a different way yet.

Thanks!

- Xiubo


>> }
>>
>> static const struct file_lock_operations ceph_fl_lock_ops = {
>> diff --git a/include/linux/ceph/ceph_fs_fl.h b/include/linux/ceph/ceph_fs_fl.h
>> new file mode 100644
>> index 000000000000..02a412b26095
>> --- /dev/null
>> +++ b/include/linux/ceph/ceph_fs_fl.h
>> @@ -0,0 +1,26 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * ceph_fs.h - Ceph constants and data types to share between kernel and
>> + * user space.
>> + *
>> + * Most types in this file are defined as little-endian, and are
>> + * primarily intended to describe data structures that pass over the
>> + * wire or that are stored on disk.
>> + *
>> + * LGPL2
>> + */
>> +
>> +#ifndef CEPH_FS_FL_H
>> +#define CEPH_FS_FL_H
>> +
>> +#include <linux/fs.h>
>> +
>> +/*
>> + * Ceph lock info
>> + */
>> +
>> +struct ceph_lock_info {
>> + struct inode *fl_inode;
>> +};
>> +
>> +#endif
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index e654435f1651..db4810d19e26 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1066,6 +1066,7 @@ bool opens_in_grace(struct net *);
>>
>> /* that will die - we need it for nfs_lock_info */
>> #include <linux/nfs_fs_i.h>
>> +#include <linux/ceph/ceph_fs_fl.h>
>>
>> /*
>> * struct file_lock represents a generic "file lock". It's used to represent
>> @@ -1119,6 +1120,7 @@ struct file_lock {
>> int state; /* state of grant or error if -ve */
>> unsigned int debug_id;
>> } afs;
>> + struct ceph_lock_info ceph_fl;
>> } fl_u;
>> } __randomize_layout;
>>


2022-11-14 13:32:36

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode

Hi

Thanks for reporting this.

I will fix it in the next version.

- Xiubo

On 14/11/2022 16:54, kernel test robot wrote:
> Hi,
>
> I love your patch! Perhaps something to improve:
>
> [auto build test WARNING on ceph-client/testing]
> [also build test WARNING on ceph-client/for-linus linus/master v6.1-rc5 next-20221111]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
> base: https://github.com/ceph/ceph-client.git testing
> patch link: https://lore.kernel.org/r/20221114051901.15371-3-xiubli%40redhat.com
> patch subject: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode
> config: hexagon-randconfig-r041-20221114
> compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
> reproduce (this is a W=1 build):
> wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # https://github.com/intel-lab-lkp/linux/commit/232cc8f1dbeddb308946202a7c67ee4d20451ae7
> git remote add linux-review https://github.com/intel-lab-lkp/linux
> git fetch --no-tags linux-review xiubli-redhat-com/ceph-fix-the-use-after-free-bug-for-file_lock/20221114-132233
> git checkout 232cc8f1dbeddb308946202a7c67ee4d20451ae7
> # save the config file
> mkdir build_dir && cp config build_dir/.config
> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash fs/ceph/
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <[email protected]>
>
> All warnings (new ones prefixed by >>):
>
> In file included from fs/ceph/locks.c:8:
> In file included from fs/ceph/super.h:8:
> In file included from include/linux/backing-dev.h:16:
> In file included from include/linux/writeback.h:13:
> In file included from include/linux/blk_types.h:10:
> In file included from include/linux/bvec.h:10:
> In file included from include/linux/highmem.h:12:
> In file included from include/linux/hardirq.h:11:
> In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
> In file included from include/asm-generic/hardirq.h:17:
> In file included from include/linux/irq.h:20:
> In file included from include/linux/io.h:13:
> In file included from arch/hexagon/include/asm/io.h:334:
> include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> val = __raw_readb(PCI_IOBASE + addr);
> ~~~~~~~~~~ ^
> include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
> ~~~~~~~~~~ ^
> include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
> #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
> ^
> In file included from fs/ceph/locks.c:8:
> In file included from fs/ceph/super.h:8:
> In file included from include/linux/backing-dev.h:16:
> In file included from include/linux/writeback.h:13:
> In file included from include/linux/blk_types.h:10:
> In file included from include/linux/bvec.h:10:
> In file included from include/linux/highmem.h:12:
> In file included from include/linux/hardirq.h:11:
> In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
> In file included from include/asm-generic/hardirq.h:17:
> In file included from include/linux/irq.h:20:
> In file included from include/linux/io.h:13:
> In file included from arch/hexagon/include/asm/io.h:334:
> include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
> ~~~~~~~~~~ ^
> include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
> #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
> ^
> In file included from fs/ceph/locks.c:8:
> In file included from fs/ceph/super.h:8:
> In file included from include/linux/backing-dev.h:16:
> In file included from include/linux/writeback.h:13:
> In file included from include/linux/blk_types.h:10:
> In file included from include/linux/bvec.h:10:
> In file included from include/linux/highmem.h:12:
> In file included from include/linux/hardirq.h:11:
> In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
> In file included from include/asm-generic/hardirq.h:17:
> In file included from include/linux/irq.h:20:
> In file included from include/linux/io.h:13:
> In file included from arch/hexagon/include/asm/io.h:334:
> include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> __raw_writeb(value, PCI_IOBASE + addr);
> ~~~~~~~~~~ ^
> include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
> ~~~~~~~~~~ ^
> include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
> __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
> ~~~~~~~~~~ ^
>>> fs/ceph/locks.c:66:6: warning: variable 'fi' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
> if (val == CEPH_FILP_AVAILABLE) {
> ^~~~~~~~~~~~~~~~~~~~~~~~~~
> fs/ceph/locks.c:79:14: note: uninitialized use occurs here
> atomic_dec(&fi->num_locks);
> ^~
> fs/ceph/locks.c:66:2: note: remove the 'if' if its condition is always true
> if (val == CEPH_FILP_AVAILABLE) {
> ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> fs/ceph/locks.c:47:27: note: initialize the variable 'fi' to silence this warning
> struct ceph_file_info *fi;
> ^
> = NULL
> 7 warnings generated.
>
>
> vim +66 fs/ceph/locks.c
>
> 42
> 43 static void ceph_fl_release_lock(struct file_lock *fl)
> 44 {
> 45 struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
> 46 struct ceph_inode_info *ci;
> 47 struct ceph_file_info *fi;
> 48 void *val;
> 49
> 50 /*
> 51 * If inode is NULL it should be a request file_lock,
> 52 * nothing we can do.
> 53 */
> 54 if (!inode)
> 55 return;
> 56
> 57 ci = ceph_inode(inode);
> 58
> 59 /*
> 60 * For Posix-style locks, it may race between filp_close()s,
> 61 * and it's possible that the 'file' memory pointed by
> 62 * 'fl->fl_file' has been released. If so just skip it.
> 63 */
> 64 rcu_read_lock();
> 65 val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
> > 66 if (val == CEPH_FILP_AVAILABLE) {
> 67 fi = fl->fl_file->private_data;
> 68 atomic_dec(&fi->num_locks);
> 69 }
> 70 rcu_read_unlock();
> 71
> 72 if (atomic_dec_and_test(&ci->i_filelock_ref)) {
> 73 /* clear error when all locks are released */
> 74 spin_lock(&ci->i_ceph_lock);
> 75 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
> 76 spin_unlock(&ci->i_ceph_lock);
> 77 }
> 78 iput(inode);
> 79 atomic_dec(&fi->num_locks);
> 80 }
> 81
>


2022-11-14 14:25:54

by Xiubo Li

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode


On 14/11/2022 21:39, Jeff Layton wrote:
> On Mon, 2022-11-14 at 13:19 +0800, [email protected] wrote:
>> From: Xiubo Li <[email protected]>
>>
>> When releasing the file locks the fl->fl_file memory could be
>> already released by another thread in filp_close(), so we couldn't
>> depend on fl->fl_file to get the inode. Just use a xarray to record
>> the opened files for each inode.
>>
>> Cc: [email protected]
>> URL: https://tracker.ceph.com/issues/57986
>> Signed-off-by: Xiubo Li <[email protected]>
>> ---
>> fs/ceph/file.c | 9 +++++++++
>> fs/ceph/inode.c | 4 ++++
>> fs/ceph/locks.c | 17 ++++++++++++++++-
>> fs/ceph/super.h | 4 ++++
>> 4 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 85afcbbb5648..cb4a9c52df27 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -231,6 +231,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
>> fi->flags |= CEPH_F_SYNC;
>>
>> file->private_data = fi;
>> +
>> + ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
>> + CEPH_FILP_AVAILABLE, GFP_KERNEL);
>> + if (ret) {
>> + kmem_cache_free(ceph_file_cachep, fi);
>> + return ret;
>> + }
>> }
>>
>> ceph_get_fmode(ci, fmode, 1);
>> @@ -932,6 +939,8 @@ int ceph_release(struct inode *inode, struct file *file)
>> dout("release inode %p regular file %p\n", inode, file);
>> WARN_ON(!list_empty(&fi->rw_contexts));
>>
>> + xa_erase(&ci->i_opened_files, (unsigned long)file);
>> +
>> ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
>> ceph_put_fmode(ci, fi->fmode, 1);
>>
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index 77b0cd9af370..554450838e44 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -619,6 +619,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>> INIT_LIST_HEAD(&ci->i_unsafe_iops);
>> spin_lock_init(&ci->i_unsafe_lock);
>>
>> + xa_init(&ci->i_opened_files);
>> +
>> ci->i_snap_realm = NULL;
>> INIT_LIST_HEAD(&ci->i_snap_realm_item);
>> INIT_LIST_HEAD(&ci->i_snap_flush_item);
>> @@ -637,6 +639,8 @@ void ceph_free_inode(struct inode *inode)
>> {
>> struct ceph_inode_info *ci = ceph_inode(inode);
>>
>> + xa_destroy(&ci->i_opened_files);
>> +
>> kfree(ci->i_symlink);
>> #ifdef CONFIG_FS_ENCRYPTION
>> kfree(ci->fscrypt_auth);
>> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
>> index d8385dd0076e..a176a30badd0 100644
>> --- a/fs/ceph/locks.c
>> +++ b/fs/ceph/locks.c
>> @@ -42,9 +42,10 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
>>
>> static void ceph_fl_release_lock(struct file_lock *fl)
>> {
>> - struct ceph_file_info *fi = fl->fl_file->private_data;
>> struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
>> struct ceph_inode_info *ci;
>> + struct ceph_file_info *fi;
>> + void *val;
>>
>> /*
>> * If inode is NULL it should be a request file_lock,
>> @@ -54,6 +55,20 @@ static void ceph_fl_release_lock(struct file_lock *fl)
>> return;
>>
>> ci = ceph_inode(inode);
>> +
>> + /*
>> + * For Posix-style locks, it may race between filp_close()s,
>> + * and it's possible that the 'file' memory pointed by
>> + * 'fl->fl_file' has been released. If so just skip it.
>> + */
>> + rcu_read_lock();
>> + val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
>> + if (val == CEPH_FILP_AVAILABLE) {
>> + fi = fl->fl_file->private_data;
>> + atomic_dec(&fi->num_locks);
> Don't you need to remove the old atomic_dec from this function if you
> move it here?

Yeah, I thought I have removed that. Not sure why I added it back.

>
>> + }
>> + rcu_read_unlock();
>> +
>> if (atomic_dec_and_test(&ci->i_filelock_ref)) {
>> /* clear error when all locks are released */
>> spin_lock(&ci->i_ceph_lock);
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 7b75a84ba48d..b3e89192cbec 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -329,6 +329,8 @@ struct ceph_inode_xattrs_info {
>> u64 version, index_version;
>> };
>>
>> +#define CEPH_FILP_AVAILABLE xa_mk_value(1)
>> +
>> /*
>> * Ceph inode.
>> */
>> @@ -434,6 +436,8 @@ struct ceph_inode_info {
>> struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
>> spinlock_t i_unsafe_lock;
>>
>> + struct xarray i_opened_files;
>> +
>> union {
>> struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
>> struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
> This looks like it'll work, but it's a lot of extra work, having to
> track this extra xarray just on the off chance that one of these fd's
> might have file locks. The num_locks field is only checked in one place
> in ceph_get_caps.
>
> Here's what I'd recommend instead:
>
> Have ceph_get_caps look at the lists in inode->i_flctx and see whether
> any of its locks have an fl_file that matches the @filp argument in that
> function. Most inodes never get any file locks, so in most cases this
> will turn out to just be a NULL pointer check for i_flctx anyway.
>
> Then you can just remove the num_locks field and call the new helper
> from ceph_get_caps instead. I'll send along a proposed patch for the
> helper in a bit.

Sure, Thanks Jeff.

- Xiubo



2022-11-14 14:27:04

by Jeffrey Layton

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] ceph: use a xarray to record all the opened files for each inode

On Mon, 2022-11-14 at 13:19 +0800, [email protected] wrote:
> From: Xiubo Li <[email protected]>
>
> When releasing the file locks the fl->fl_file memory could be
> already released by another thread in filp_close(), so we couldn't
> depend on fl->fl_file to get the inode. Just use a xarray to record
> the opened files for each inode.
>
> Cc: [email protected]
> URL: https://tracker.ceph.com/issues/57986
> Signed-off-by: Xiubo Li <[email protected]>
> ---
> fs/ceph/file.c | 9 +++++++++
> fs/ceph/inode.c | 4 ++++
> fs/ceph/locks.c | 17 ++++++++++++++++-
> fs/ceph/super.h | 4 ++++
> 4 files changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 85afcbbb5648..cb4a9c52df27 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -231,6 +231,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
> fi->flags |= CEPH_F_SYNC;
>
> file->private_data = fi;
> +
> + ret = xa_insert(&ci->i_opened_files, (unsigned long)file,
> + CEPH_FILP_AVAILABLE, GFP_KERNEL);
> + if (ret) {
> + kmem_cache_free(ceph_file_cachep, fi);
> + return ret;
> + }
> }
>
> ceph_get_fmode(ci, fmode, 1);
> @@ -932,6 +939,8 @@ int ceph_release(struct inode *inode, struct file *file)
> dout("release inode %p regular file %p\n", inode, file);
> WARN_ON(!list_empty(&fi->rw_contexts));
>
> + xa_erase(&ci->i_opened_files, (unsigned long)file);
> +
> ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
> ceph_put_fmode(ci, fi->fmode, 1);
>
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 77b0cd9af370..554450838e44 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -619,6 +619,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> INIT_LIST_HEAD(&ci->i_unsafe_iops);
> spin_lock_init(&ci->i_unsafe_lock);
>
> + xa_init(&ci->i_opened_files);
> +
> ci->i_snap_realm = NULL;
> INIT_LIST_HEAD(&ci->i_snap_realm_item);
> INIT_LIST_HEAD(&ci->i_snap_flush_item);
> @@ -637,6 +639,8 @@ void ceph_free_inode(struct inode *inode)
> {
> struct ceph_inode_info *ci = ceph_inode(inode);
>
> + xa_destroy(&ci->i_opened_files);
> +
> kfree(ci->i_symlink);
> #ifdef CONFIG_FS_ENCRYPTION
> kfree(ci->fscrypt_auth);
> diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
> index d8385dd0076e..a176a30badd0 100644
> --- a/fs/ceph/locks.c
> +++ b/fs/ceph/locks.c
> @@ -42,9 +42,10 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
>
> static void ceph_fl_release_lock(struct file_lock *fl)
> {
> - struct ceph_file_info *fi = fl->fl_file->private_data;
> struct inode *inode = fl->fl_u.ceph_fl.fl_inode;
> struct ceph_inode_info *ci;
> + struct ceph_file_info *fi;
> + void *val;
>
> /*
> * If inode is NULL it should be a request file_lock,
> @@ -54,6 +55,20 @@ static void ceph_fl_release_lock(struct file_lock *fl)
> return;
>
> ci = ceph_inode(inode);
> +
> + /*
> + * For Posix-style locks, it may race between filp_close()s,
> + * and it's possible that the 'file' memory pointed by
> + * 'fl->fl_file' has been released. If so just skip it.
> + */
> + rcu_read_lock();
> + val = xa_load(&ci->i_opened_files, (unsigned long)fl->fl_file);
> + if (val == CEPH_FILP_AVAILABLE) {
> + fi = fl->fl_file->private_data;
> + atomic_dec(&fi->num_locks);

Don't you need to remove the old atomic_dec from this function if you
move it here?

> + }
> + rcu_read_unlock();
> +
> if (atomic_dec_and_test(&ci->i_filelock_ref)) {
> /* clear error when all locks are released */
> spin_lock(&ci->i_ceph_lock);
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 7b75a84ba48d..b3e89192cbec 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -329,6 +329,8 @@ struct ceph_inode_xattrs_info {
> u64 version, index_version;
> };
>
> +#define CEPH_FILP_AVAILABLE xa_mk_value(1)
> +
> /*
> * Ceph inode.
> */
> @@ -434,6 +436,8 @@ struct ceph_inode_info {
> struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
> spinlock_t i_unsafe_lock;
>
> + struct xarray i_opened_files;
> +
> union {
> struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */

This looks like it'll work, but it's a lot of extra work, having to
track this extra xarray just on the off chance that one of these fd's
might have file locks. The num_locks field is only checked in one place
in ceph_get_caps.

Here's what I'd recommend instead:

Have ceph_get_caps look at the lists in inode->i_flctx and see whether
any of its locks have an fl_file that matches the @filp argument in that
function. Most inodes never get any file locks, so in most cases this
will turn out to just be a NULL pointer check for i_flctx anyway.

Then you can just remove the num_locks field and call the new helper
from ceph_get_caps instead. I'll send along a proposed patch for the
helper in a bit.
--
Jeff Layton <[email protected]>