Hi,
I'm sending three "sleeping function called from invalid context" bug
fixes that I had on my TODO for a while. All of them are ceph_buffer_put
related, and all the fixes follow the same pattern: delay the operation
until the ci->i_ceph_lock is released.
The first patch simply allows ceph_buffer_put to receive a NULL buffer so
that the NULL check doesn't need to be performed in all the other patches.
IOW, it's not really required, just convenient.
(Note: maybe these patches should all be tagged for stable.)
Luis Henriques (4):
libceph: allow ceph_buffer_put() to receive a NULL ceph_buffer
ceph: fix buffer free while holding i_ceph_lock in __ceph_setxattr()
ceph: fix buffer free while holding i_ceph_lock in
__ceph_build_xattrs_blob()
ceph: fix buffer free while holding i_ceph_lock in fill_inode()
fs/ceph/caps.c | 5 ++++-
fs/ceph/inode.c | 7 ++++---
fs/ceph/snap.c | 4 +++-
fs/ceph/super.h | 2 +-
fs/ceph/xattr.c | 19 ++++++++++++++-----
include/linux/ceph/buffer.h | 3 ++-
6 files changed, 28 insertions(+), 12 deletions(-)
Signed-off-by: Luis Henriques <[email protected]>
---
include/linux/ceph/buffer.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h
index 5e58bb29b1a3..11cdc7c60480 100644
--- a/include/linux/ceph/buffer.h
+++ b/include/linux/ceph/buffer.h
@@ -30,7 +30,8 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
static inline void ceph_buffer_put(struct ceph_buffer *b)
{
- kref_put(&b->kref, ceph_buffer_release);
+ if (b)
+ kref_put(&b->kref, ceph_buffer_release);
}
extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
Calling ceph_buffer_put() in __ceph_build_xattrs_blob() may result in
freeing the i_xattrs.blob buffer while holding the i_ceph_lock. This can
be fixed by having this function returning the old blob buffer and have
the callers of this function freeing it when the lock is released.
The following backtrace was triggered by fstests generic/117.
BUG: sleeping function called from invalid context at mm/vmalloc.c:2283
in_atomic(): 1, irqs_disabled(): 0, pid: 649, name: fsstress
4 locks held by fsstress/649:
#0: 00000000a7478e7e (&type->s_umount_key#19){++++}, at: iterate_supers+0x77/0xf0
#1: 00000000f8de1423 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: ceph_check_caps+0x7b/0xc60
#2: 00000000562f2b27 (&s->s_mutex){+.+.}, at: ceph_check_caps+0x3bd/0xc60
#3: 00000000f83ce16a (&mdsc->snap_rwsem){++++}, at: ceph_check_caps+0x3ed/0xc60
CPU: 1 PID: 649 Comm: fsstress Not tainted 5.2.0+ #439
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x67/0x90
___might_sleep.cold+0x9f/0xb1
vfree+0x4b/0x60
ceph_buffer_release+0x1b/0x60
__ceph_build_xattrs_blob+0x12b/0x170
__send_cap+0x302/0x540
? __lock_acquire+0x23c/0x1e40
? __mark_caps_flushing+0x15c/0x280
? _raw_spin_unlock+0x24/0x30
ceph_check_caps+0x5f0/0xc60
ceph_flush_dirty_caps+0x7c/0x150
? __ia32_sys_fdatasync+0x20/0x20
ceph_sync_fs+0x5a/0x130
iterate_supers+0x8f/0xf0
ksys_sync+0x4f/0xb0
__ia32_sys_sync+0xa/0x10
do_syscall_64+0x50/0x1c0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fc6409ab617
Signed-off-by: Luis Henriques <[email protected]>
---
fs/ceph/caps.c | 5 ++++-
fs/ceph/snap.c | 4 +++-
fs/ceph/super.h | 2 +-
fs/ceph/xattr.c | 11 ++++++++---
4 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d98dcd976c80..ce0f5658720a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1301,6 +1301,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
{
struct ceph_inode_info *ci = cap->ci;
struct inode *inode = &ci->vfs_inode;
+ struct ceph_buffer *old_blob = NULL;
struct cap_msg_args arg;
int held, revoking;
int wake = 0;
@@ -1365,7 +1366,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
+ old_blob = __ceph_build_xattrs_blob(ci);
arg.xattr_version = ci->i_xattrs.version;
arg.xattr_buf = ci->i_xattrs.blob;
} else {
@@ -1409,6 +1410,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
+
ret = send_cap_msg(&arg);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4c6494eb02b5..ccfcc66aaf44 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -465,6 +465,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_buffer *old_blob = NULL;
int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -541,7 +542,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
capsnap->gid = inode->i_gid;
if (dirty & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
+ old_blob = __ceph_build_xattrs_blob(ci);
capsnap->xattr_blob =
ceph_buffer_get(ci->i_xattrs.blob);
capsnap->xattr_version = ci->i_xattrs.version;
@@ -584,6 +585,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
}
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d2352fd95dbc..6b9f1ee7de85 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -926,7 +926,7 @@ extern int ceph_getattr(const struct path *path, struct kstat *stat,
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
-extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern const struct xattr_handler *ceph_xattr_handlers[];
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c083557b3657..939eab7aa219 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -754,12 +754,15 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
/*
* If there are dirty xattrs, reencode xattrs into the prealloc_blob
- * and swap into place.
+ * and swap into place. It returns the old i_xattrs.blob (or NULL) so
+ * that it can be freed by the caller as the i_ceph_lock is likely to be
+ * held.
*/
-void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
{
struct rb_node *p;
struct ceph_inode_xattr *xattr = NULL;
+ struct ceph_buffer *old_blob = NULL;
void *dest;
dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
@@ -790,12 +793,14 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = NULL;
ci->i_xattrs.dirty = false;
ci->i_xattrs.version++;
}
+
+ return old_blob;
}
static inline int __get_request_mask(struct inode *in) {
Calling ceph_buffer_put() in fill_inode() may result in freeing the
i_xattrs.blob buffer while holding the i_ceph_lock. This can be fixed by
postponing the call until later, when the lock is released.
The following backtrace was triggered by fstests generic/070.
BUG: sleeping function called from invalid context at mm/vmalloc.c:2283
in_atomic(): 1, irqs_disabled(): 0, pid: 3852, name: kworker/0:4
6 locks held by kworker/0:4/3852:
#0: 000000004270f6bb ((wq_completion)ceph-msgr){+.+.}, at: process_one_work+0x1b8/0x5f0
#1: 00000000eb420803 ((work_completion)(&(&con->work)->work)){+.+.}, at: process_one_work+0x1b8/0x5f0
#2: 00000000be1c53a4 (&s->s_mutex){+.+.}, at: dispatch+0x288/0x1476
#3: 00000000559cb958 (&mdsc->snap_rwsem){++++}, at: dispatch+0x2eb/0x1476
#4: 000000000d5ebbae (&req->r_fill_mutex){+.+.}, at: dispatch+0x2fc/0x1476
#5: 00000000a83d0514 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: fill_inode.isra.0+0xf8/0xf70
CPU: 0 PID: 3852 Comm: kworker/0:4 Not tainted 5.2.0+ #441
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014
Workqueue: ceph-msgr ceph_con_workfn
Call Trace:
dump_stack+0x67/0x90
___might_sleep.cold+0x9f/0xb1
vfree+0x4b/0x60
ceph_buffer_release+0x1b/0x60
fill_inode.isra.0+0xa9b/0xf70
ceph_fill_trace+0x13b/0xc70
? dispatch+0x2eb/0x1476
dispatch+0x320/0x1476
? __mutex_unlock_slowpath+0x4d/0x2a0
ceph_con_workfn+0xc97/0x2ec0
? process_one_work+0x1b8/0x5f0
process_one_work+0x244/0x5f0
worker_thread+0x4d/0x3e0
kthread+0x105/0x140
? process_one_work+0x5f0/0x5f0
? kthread_park+0x90/0x90
ret_from_fork+0x3a/0x50
Signed-off-by: Luis Henriques <[email protected]>
---
fs/ceph/inode.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 791f84a13bb8..18500edefc56 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -736,6 +736,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued, new_issued, info_caps;
struct timespec64 mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_buffer *old_blob = NULL;
struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
@@ -881,7 +882,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
if (ci->i_xattrs.blob)
- ceph_buffer_put(ci->i_xattrs.blob);
+ old_blob = ci->i_xattrs.blob;
ci->i_xattrs.blob = xattr_blob;
if (xattr_blob)
memcpy(ci->i_xattrs.blob->vec.iov_base,
@@ -1022,8 +1023,8 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
out:
if (new_cap)
ceph_put_cap(mdsc, new_cap);
- if (xattr_blob)
- ceph_buffer_put(xattr_blob);
+ ceph_buffer_put(old_blob);
+ ceph_buffer_put(xattr_blob);
ceph_put_string(pool_ns);
return err;
}
Calling ceph_buffer_put() in __ceph_setxattr() may end up freeing the
i_xattrs.prealloc_blob buffer while holding the i_ceph_lock. This can be
fixed by postponing the call until later, when the lock is released.
The following backtrace was triggered by fstests generic/117.
BUG: sleeping function called from invalid context at mm/vmalloc.c:2283
in_atomic(): 1, irqs_disabled(): 0, pid: 650, name: fsstress
3 locks held by fsstress/650:
#0: 00000000870a0fe8 (sb_writers#8){.+.+}, at: mnt_want_write+0x20/0x50
#1: 00000000ba0c4c74 (&type->i_mutex_dir_key#6){++++}, at: vfs_setxattr+0x55/0xa0
#2: 000000008dfbb3f2 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: __ceph_setxattr+0x297/0x810
CPU: 1 PID: 650 Comm: fsstress Not tainted 5.2.0+ #437
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x67/0x90
___might_sleep.cold+0x9f/0xb1
vfree+0x4b/0x60
ceph_buffer_release+0x1b/0x60
__ceph_setxattr+0x2b4/0x810
__vfs_setxattr+0x66/0x80
__vfs_setxattr_noperm+0x59/0xf0
vfs_setxattr+0x81/0xa0
setxattr+0x115/0x230
? filename_lookup+0xc9/0x140
? rcu_read_lock_sched_held+0x74/0x80
? rcu_sync_lockdep_assert+0x2e/0x60
? __sb_start_write+0x142/0x1a0
? mnt_want_write+0x20/0x50
path_setxattr+0xba/0xd0
__x64_sys_lsetxattr+0x24/0x30
do_syscall_64+0x50/0x1c0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7ff23514359a
Signed-off-by: Luis Henriques <[email protected]>
---
fs/ceph/xattr.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 37b458a9af3a..c083557b3657 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1036,6 +1036,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf = NULL;
+ struct ceph_buffer *old_blob = NULL;
int issued;
int err;
int dirty = 0;
@@ -1109,13 +1110,15 @@ int __ceph_setxattr(struct inode *inode, const char *name,
struct ceph_buffer *blob;
spin_unlock(&ci->i_ceph_lock);
- dout(" preaallocating new blob size=%d\n", required_blob_size);
+ ceph_buffer_put(old_blob); /* Shouldn't be required */
+ dout(" pre-allocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
+ /* prealloc_blob can't be released while holding i_ceph_lock */
if (ci->i_xattrs.prealloc_blob)
- ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ old_blob = ci->i_xattrs.prealloc_blob;
ci->i_xattrs.prealloc_blob = blob;
goto retry;
}
@@ -1131,6 +1134,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
}
spin_unlock(&ci->i_ceph_lock);
+ ceph_buffer_put(old_blob);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
if (dirty)
On Fri, 2019-07-19 at 15:32 +0100, Luis Henriques wrote:
> Hi,
>
> I'm sending three "sleeping function called from invalid context" bug
> fixes that I had on my TODO for a while. All of them are ceph_buffer_put
> related, and all the fixes follow the same pattern: delay the operation
> until the ci->i_ceph_lock is released.
>
> The first patch simply allows ceph_buffer_put to receive a NULL buffer so
> that the NULL check doesn't need to be performed in all the other patches.
> IOW, it's not really required, just convenient.
>
> (Note: maybe these patches should all be tagged for stable.)
>
> Luis Henriques (4):
> libceph: allow ceph_buffer_put() to receive a NULL ceph_buffer
> ceph: fix buffer free while holding i_ceph_lock in __ceph_setxattr()
> ceph: fix buffer free while holding i_ceph_lock in
> __ceph_build_xattrs_blob()
> ceph: fix buffer free while holding i_ceph_lock in fill_inode()
>
> fs/ceph/caps.c | 5 ++++-
> fs/ceph/inode.c | 7 ++++---
> fs/ceph/snap.c | 4 +++-
> fs/ceph/super.h | 2 +-
> fs/ceph/xattr.c | 19 ++++++++++++++-----
> include/linux/ceph/buffer.h | 3 ++-
> 6 files changed, 28 insertions(+), 12 deletions(-)
This all looks good to me. I'll plan to merge these into the testing
branch soon, and tag them for stable.
PS: On a related note (and more of a question for Ilya)...
I'm wondering if we get any benefit from having our own ceph_kvmalloc
routine. Why are we not better off using the stock kvmalloc routine
instead? Forcing a vmalloc just because we've gone above 32k allocation
doesn't seem like the right thing to do.
PPS: I also wonder if we ought to put a might_sleep() in kvfree(). I
think that kfree generally doesn't, and I wonder how many uses of this
end up using kfree until memory ends up fragmented.
--
Jeff Layton <[email protected]>
On Fri, 2019-07-19 at 15:32 +0100, Luis Henriques wrote:
> Calling ceph_buffer_put() in __ceph_setxattr() may end up freeing the
> i_xattrs.prealloc_blob buffer while holding the i_ceph_lock. This can be
> fixed by postponing the call until later, when the lock is released.
>
> The following backtrace was triggered by fstests generic/117.
>
> BUG: sleeping function called from invalid context at mm/vmalloc.c:2283
> in_atomic(): 1, irqs_disabled(): 0, pid: 650, name: fsstress
> 3 locks held by fsstress/650:
> #0: 00000000870a0fe8 (sb_writers#8){.+.+}, at: mnt_want_write+0x20/0x50
> #1: 00000000ba0c4c74 (&type->i_mutex_dir_key#6){++++}, at: vfs_setxattr+0x55/0xa0
> #2: 000000008dfbb3f2 (&(&ci->i_ceph_lock)->rlock){+.+.}, at: __ceph_setxattr+0x297/0x810
> CPU: 1 PID: 650 Comm: fsstress Not tainted 5.2.0+ #437
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014
> Call Trace:
> dump_stack+0x67/0x90
> ___might_sleep.cold+0x9f/0xb1
> vfree+0x4b/0x60
> ceph_buffer_release+0x1b/0x60
> __ceph_setxattr+0x2b4/0x810
> __vfs_setxattr+0x66/0x80
> __vfs_setxattr_noperm+0x59/0xf0
> vfs_setxattr+0x81/0xa0
> setxattr+0x115/0x230
> ? filename_lookup+0xc9/0x140
> ? rcu_read_lock_sched_held+0x74/0x80
> ? rcu_sync_lockdep_assert+0x2e/0x60
> ? __sb_start_write+0x142/0x1a0
> ? mnt_want_write+0x20/0x50
> path_setxattr+0xba/0xd0
> __x64_sys_lsetxattr+0x24/0x30
> do_syscall_64+0x50/0x1c0
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x7ff23514359a
>
> Signed-off-by: Luis Henriques <[email protected]>
> ---
> fs/ceph/xattr.c | 8 ++++++--
> 1 file changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> index 37b458a9af3a..c083557b3657 100644
> --- a/fs/ceph/xattr.c
> +++ b/fs/ceph/xattr.c
> @@ -1036,6 +1036,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
> struct ceph_inode_info *ci = ceph_inode(inode);
> struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
> struct ceph_cap_flush *prealloc_cf = NULL;
> + struct ceph_buffer *old_blob = NULL;
> int issued;
> int err;
> int dirty = 0;
> @@ -1109,13 +1110,15 @@ int __ceph_setxattr(struct inode *inode, const char *name,
> struct ceph_buffer *blob;
>
> spin_unlock(&ci->i_ceph_lock);
> - dout(" preaallocating new blob size=%d\n", required_blob_size);
> + ceph_buffer_put(old_blob); /* Shouldn't be required */
> + dout(" pre-allocating new blob size=%d\n", required_blob_size);
> blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
> if (!blob)
> goto do_sync_unlocked;
> spin_lock(&ci->i_ceph_lock);
> + /* prealloc_blob can't be released while holding i_ceph_lock */
> if (ci->i_xattrs.prealloc_blob)
> - ceph_buffer_put(ci->i_xattrs.prealloc_blob);
> + old_blob = ci->i_xattrs.prealloc_blob;
> ci->i_xattrs.prealloc_blob = blob;
> goto retry;
> }
> @@ -1131,6 +1134,7 @@ int __ceph_setxattr(struct inode *inode, const char *name,
> }
>
> spin_unlock(&ci->i_ceph_lock);
> + ceph_buffer_put(old_blob);
> if (lock_snap_rwsem)
> up_read(&mdsc->snap_rwsem);
> if (dirty)
(cc'ing Al)
Al pointed out on IRC that vfree should be callable under spinlock. It
only sleeps if !in_interrupt(), and I think that should return true if
we're holding a spinlock.
I'll plan to try replicating this soon.
--
Jeff Layton <[email protected]>
On Fri, Jul 19, 2019 at 07:07:49PM -0400, Jeff Layton wrote:
> Al pointed out on IRC that vfree should be callable under spinlock.
Al had been near-terminally low on caffeine at the time, posted
a retraction a few minutes later and went to grab some coffee...
> It
> only sleeps if !in_interrupt(), and I think that should return true if
> we're holding a spinlock.
It can be used from RCU callbacks and all such; it *can't* be used from
under spinlock - on non-preempt builds there's no way to recognize that.
On Sat, Jul 20, 2019 at 12:23:08AM +0100, Al Viro wrote:
> On Fri, Jul 19, 2019 at 07:07:49PM -0400, Jeff Layton wrote:
>
> > Al pointed out on IRC that vfree should be callable under spinlock.
>
> Al had been near-terminally low on caffeine at the time, posted
> a retraction a few minutes later and went to grab some coffee...
>
> > It
> > only sleeps if !in_interrupt(), and I think that should return true if
> > we're holding a spinlock.
>
> It can be used from RCU callbacks and all such; it *can't* be used from
> under spinlock - on non-preempt builds there's no way to recognize that.
Re original patch: looks like the sane way to handle that.
Alternatively, we could add kvfree_atomic() for use in such situations,
but I rather doubt that it's a good idea - not unless you need to free
something under a spinlock held over a large area, which is generally
a bad idea to start with...
Note that vfree_atomic() has only one caller in the entire tree,
BTW.
On Sat, 2019-07-20 at 00:30 +0100, Al Viro wrote:
> On Sat, Jul 20, 2019 at 12:23:08AM +0100, Al Viro wrote:
> > On Fri, Jul 19, 2019 at 07:07:49PM -0400, Jeff Layton wrote:
> >
> > > Al pointed out on IRC that vfree should be callable under spinlock.
> >
> > Al had been near-terminally low on caffeine at the time, posted
> > a retraction a few minutes later and went to grab some coffee...
> >
> > > It
> > > only sleeps if !in_interrupt(), and I think that should return true if
> > > we're holding a spinlock.
> >
> > It can be used from RCU callbacks and all such; it *can't* be used from
> > under spinlock - on non-preempt builds there's no way to recognize that.
>
> Re original patch: looks like the sane way to handle that.
> Alternatively, we could add kvfree_atomic() for use in such situations,
> but I rather doubt that it's a good idea - not unless you need to free
> something under a spinlock held over a large area, which is generally
> a bad idea to start with...
>
> Note that vfree_atomic() has only one caller in the entire tree,
> BTW.
In that case, I wonder if we ought to add this to the top of kvfree():
might_sleep_if(!in_interrupt());
Might there be other places that are calling it under spinlock that are
almost always going down the kfree() path?
--
Jeff Layton <[email protected]>
On Fri, Jul 19, 2019 at 5:20 PM Jeff Layton <[email protected]> wrote:
>
> On Fri, 2019-07-19 at 15:32 +0100, Luis Henriques wrote:
> > Hi,
> >
> > I'm sending three "sleeping function called from invalid context" bug
> > fixes that I had on my TODO for a while. All of them are ceph_buffer_put
> > related, and all the fixes follow the same pattern: delay the operation
> > until the ci->i_ceph_lock is released.
> >
> > The first patch simply allows ceph_buffer_put to receive a NULL buffer so
> > that the NULL check doesn't need to be performed in all the other patches.
> > IOW, it's not really required, just convenient.
> >
> > (Note: maybe these patches should all be tagged for stable.)
> >
> > Luis Henriques (4):
> > libceph: allow ceph_buffer_put() to receive a NULL ceph_buffer
> > ceph: fix buffer free while holding i_ceph_lock in __ceph_setxattr()
> > ceph: fix buffer free while holding i_ceph_lock in
> > __ceph_build_xattrs_blob()
> > ceph: fix buffer free while holding i_ceph_lock in fill_inode()
> >
> > fs/ceph/caps.c | 5 ++++-
> > fs/ceph/inode.c | 7 ++++---
> > fs/ceph/snap.c | 4 +++-
> > fs/ceph/super.h | 2 +-
> > fs/ceph/xattr.c | 19 ++++++++++++++-----
> > include/linux/ceph/buffer.h | 3 ++-
> > 6 files changed, 28 insertions(+), 12 deletions(-)
>
> This all looks good to me. I'll plan to merge these into the testing
> branch soon, and tag them for stable.
>
> PS: On a related note (and more of a question for Ilya)...
>
> I'm wondering if we get any benefit from having our own ceph_kvmalloc
> routine. Why are we not better off using the stock kvmalloc routine
> instead? Forcing a vmalloc just because we've gone above 32k allocation
> doesn't seem like the right thing to do.
I don't remember off the top of my head and can't check right now.
Could be that kvmalloc() didn't exist back then. I'll add that to my
TODO list.
Thanks,
Ilya