2022-07-11 03:49:08

by Ian Kent

[permalink] [raw]
Subject: [PATCH 0/3] autofs: fix may_umount_tree()

The function used by autofs to check if a tree of mounts may be umounted
doesn't work with mount namespaces.

Some time ago there was an attempt to fix it but, apart from the
implementation being wrong, it failed to take advantage of cases that
allowed the check to terminate early and so was too inefficient to be
considered for merge.

This series utilizes cases for which the check can be terminated early
as best it can.

The patches in this series are prefixed with vfs because they are
changes to the VFS code but the only caller of the may_umount_tree()
function is the autofs file system.

For the interested here is a procedure that can be used to reproduce
the problem on a current kernel:

- Add this line to /etc/auto.master:
/- /etc/auto.test -t 5

- create the map /etc/auto.test as:
/test -fstype=tmpfs :tmpfs

- Enable debug logging in automount:

sed -i '/^#logging =/c logging = debug' /etc/autofs.conf
systemctl restart autofs.service

The autofs debug logging output can be observed in another terminal
using "journalctl -f".

Use the following script to run the two tests below.

$ cat /usr/local/bin/test.sh
#!/bin/sh
set -e
exec > >(logger --id=$$) 2>&1
echo Starting test
# Change to the /test directory to keep the mount active
cd /test
grep /test /proc/self/mountinfo
sleep 10
echo Ending test

1. Run the test script as root from the root mount namespace.
- observe that automount reports "expire_proc_direct: 1 remaining in /-"
until after the script exits.
- correct behaviour.

2. Run the test script as root from a new mount namespace by using:

# unshare -m --propagation unchanged test.sh

- Observe that automount reports "expiring path /test" before the
script has exited and tries to unmount /test.
This fails with ">> umount: /test: target is busy." until the script
exits.
- incorrect behaviour.

Signed-off-by: Ian Kent <[email protected]>
---

Ian Kent (3):
vfs: track count of child mounts
vfs: add propagate_mount_tree_busy() helper
vfs: make may_umount_tree() mount namespace aware


fs/autofs/expire.c | 14 ++++++++--
fs/mount.h | 1 +
fs/namespace.c | 40 +++++++++++++++++++---------
fs/pnode.c | 61 +++++++++++++++++++++++++++++++++++++++++++
fs/pnode.h | 1 +
include/linux/mount.h | 5 +++-
6 files changed, 107 insertions(+), 15 deletions(-)

--
Ian


2022-07-11 04:12:53

by Ian Kent

[permalink] [raw]
Subject: [PATCH 1/3] vfs: track count of child mounts

While the total reference count of a mount is mostly all that's needed
the reference count corresponding to the mounts only is occasionally
also needed (for example, autofs checking if a tree of mounts can be
expired).

To make this reference count available with minimal changes add a
counter to track the number of child mounts under a given mount. This
count can then be used to calculate the mounts only reference count.

Signed-off-by: Ian Kent <[email protected]>
---
fs/mount.h | 1 +
fs/namespace.c | 8 ++++++++
2 files changed, 9 insertions(+)

diff --git a/fs/mount.h b/fs/mount.h
index 0b6e08cf8afb..3f0f62912463 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -52,6 +52,7 @@ struct mount {
int mnt_writers;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
+ unsigned int mnt_mounts_cnt; /* count of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
diff --git a/fs/namespace.c b/fs/namespace.c
index e6a7e769d25d..3c1ee5b5bb69 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -882,6 +882,8 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
struct mountpoint *mp;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
+ if (!list_empty(&mnt->mnt_child))
+ mnt->mnt_parent->mnt_mounts_cnt--;
list_del_init(&mnt->mnt_child);
hlist_del_init_rcu(&mnt->mnt_hash);
hlist_del_init(&mnt->mnt_mp_list);
@@ -918,6 +920,7 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ parent->mnt_mounts_cnt++;
}

/*
@@ -936,6 +939,8 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
struct mountpoint *old_mp = mnt->mnt_mp;
struct mount *old_parent = mnt->mnt_parent;

+ if (!list_empty(&mnt->mnt_child))
+ mnt->mnt_parent->mnt_mounts_cnt--;
list_del_init(&mnt->mnt_child);
hlist_del_init(&mnt->mnt_mp_list);
hlist_del_init_rcu(&mnt->mnt_hash);
@@ -1562,6 +1567,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)

/* Hide the mounts from mnt_mounts */
list_for_each_entry(p, &tmp_list, mnt_list) {
+ if (!list_empty(&p->mnt_child))
+ p->mnt_parent->mnt_mounts_cnt--;
list_del_init(&p->mnt_child);
}

@@ -1590,6 +1597,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+ p->mnt_parent->mnt_mounts_cnt++;
} else {
umount_mnt(p);
}


2022-07-20 02:05:15

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: track count of child mounts

On Mon, Jul 11, 2022 at 11:37:40AM +0800, Ian Kent wrote:
> While the total reference count of a mount is mostly all that's needed
> the reference count corresponding to the mounts only is occassionally
> also needed (for example, autofs checking if a tree of mounts can be
> expired).
>
> To make this reference count avaialble with minimal changes add a
> counter to track the number of child mounts under a given mount. This
> count can then be used to calculate the mounts only reference count.

No. This is a wrong approach - instead of keeping track of number of
children, we should just stop having them contribute to refcount of
the parent. Here's what I've got in my local tree; life gets simpler
that way.

commit e99f1f9cc864103f326a5352e6ce1e377613437f
Author: Al Viro <[email protected]>
Date: Sat Jul 9 14:45:39 2022 -0400

namespace: don't keep ->mnt_parent pinned

makes refcounting more consistent

Signed-off-by: Al Viro <[email protected]>

diff --git a/fs/namespace.c b/fs/namespace.c
index 68789f896f08..53c29110a0cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -906,7 +906,6 @@ void mnt_set_mountpoint(struct mount *mnt,
struct mount *child_mnt)
{
mp->m_count++;
- mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = mp->m_dentry;
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
@@ -1429,22 +1428,18 @@ void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
int may_umount_tree(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
- int actual_refs = 0;
- int minimum_refs = 0;
- struct mount *p;
BUG_ON(!m);

/* write lock needed for mnt_get_count */
lock_mount_hash();
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- actual_refs += mnt_get_count(p);
- minimum_refs += 2;
+ for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+ int allowed = p == mnt ? 2 : 1;
+ if (mnt_get_count(p) > allowed) {
+ unlock_mount_hash();
+ return 0;
+ }
}
unlock_mount_hash();
-
- if (actual_refs > minimum_refs)
- return 0;
-
return 1;
}

@@ -1586,7 +1581,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)

disconnect = disconnect_mount(p, how);
if (mnt_has_parent(p)) {
- mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -2892,12 +2886,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
put_mountpoint(old_mp);
out:
unlock_mount(mp);
- if (!err) {
- if (attached)
- mntput_no_expire(parent);
- else
- free_mnt_ns(ns);
- }
+ if (!err && !attached)
+ free_mnt_ns(ns);
return err;
}

@@ -3869,7 +3859,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
struct path new, old, root;
- struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
+ struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent;
struct mountpoint *old_mp, *root_mp;
int error;

@@ -3900,10 +3890,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
old_mnt = real_mount(old.mnt);
- ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
- IS_MNT_SHARED(ex_parent) ||
+ IS_MNT_SHARED(new_mnt->mnt_parent) ||
IS_MNT_SHARED(root_parent))
goto out4;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
@@ -3942,7 +3931,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
attach_mnt(root_mnt, old_mnt, old_mp);
/* mount new_root on / */
attach_mnt(new_mnt, root_parent, root_mp);
- mnt_add_count(root_parent, -1);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
@@ -3952,8 +3940,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
error = 0;
out4:
unlock_mount(old_mp);
- if (!error)
- mntput_no_expire(ex_parent);
out3:
path_put(&root);
out2:
diff --git a/fs/pnode.c b/fs/pnode.c
index 1106137c747a..e2c8a4b18857 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -368,7 +368,7 @@ static inline int do_refcount_check(struct mount *mnt, int count)
*/
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
- struct mount *m, *child, *topper;
+ struct mount *m, *child;
struct mount *parent = mnt->mnt_parent;

if (mnt == parent)
@@ -384,7 +384,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)

for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- int count = 1;
child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
if (!child)
continue;
@@ -392,13 +391,10 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
/* Is there exactly one mount on the child that covers
* it completely whose reference should be ignored?
*/
- topper = find_topper(child);
- if (topper)
- count += 1;
- else if (!list_empty(&child->mnt_mounts))
+ if (!find_topper(child) && !list_empty(&child->mnt_mounts))
continue;

- if (do_refcount_check(child, count))
+ if (do_refcount_check(child, 1))
return 1;
}
return 0;

2022-07-20 03:05:33

by Ian Kent

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: track count of child mounts


On 20/7/22 09:50, Al Viro wrote:
> On Mon, Jul 11, 2022 at 11:37:40AM +0800, Ian Kent wrote:
>> While the total reference count of a mount is mostly all that's needed
>> the reference count corresponding to the mounts only is occassionally
>> also needed (for example, autofs checking if a tree of mounts can be
>> expired).
>>
>> To make this reference count avaialble with minimal changes add a
>> counter to track the number of child mounts under a given mount. This
>> count can then be used to calculate the mounts only reference count.
> No. This is a wrong approach - instead of keeping track of number of
> children, we should just stop having them contribute to refcount of
> the parent. Here's what I've got in my local tree; life gets simpler
> that way.

Right, I'll grab this and run some tests.


Ian

>
> commit e99f1f9cc864103f326a5352e6ce1e377613437f
> Author: Al Viro <[email protected]>
> Date: Sat Jul 9 14:45:39 2022 -0400
>
> namespace: don't keep ->mnt_parent pinned
>
> makes refcounting more consistent
>
> Signed-off-by: Al Viro <[email protected]>
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 68789f896f08..53c29110a0cd 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -906,7 +906,6 @@ void mnt_set_mountpoint(struct mount *mnt,
> struct mount *child_mnt)
> {
> mp->m_count++;
> - mnt_add_count(mnt, 1); /* essentially, that's mntget */
> child_mnt->mnt_mountpoint = mp->m_dentry;
> child_mnt->mnt_parent = mnt;
> child_mnt->mnt_mp = mp;
> @@ -1429,22 +1428,18 @@ void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
> int may_umount_tree(struct vfsmount *m)
> {
> struct mount *mnt = real_mount(m);
> - int actual_refs = 0;
> - int minimum_refs = 0;
> - struct mount *p;
> BUG_ON(!m);
>
> /* write lock needed for mnt_get_count */
> lock_mount_hash();
> - for (p = mnt; p; p = next_mnt(p, mnt)) {
> - actual_refs += mnt_get_count(p);
> - minimum_refs += 2;
> + for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
> + int allowed = p == mnt ? 2 : 1;
> + if (mnt_get_count(p) > allowed) {
> + unlock_mount_hash();
> + return 0;
> + }
> }
> unlock_mount_hash();
> -
> - if (actual_refs > minimum_refs)
> - return 0;
> -
> return 1;
> }
>
> @@ -1586,7 +1581,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>
> disconnect = disconnect_mount(p, how);
> if (mnt_has_parent(p)) {
> - mnt_add_count(p->mnt_parent, -1);
> if (!disconnect) {
> /* Don't forget about p */
> list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
> @@ -2892,12 +2886,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
> put_mountpoint(old_mp);
> out:
> unlock_mount(mp);
> - if (!err) {
> - if (attached)
> - mntput_no_expire(parent);
> - else
> - free_mnt_ns(ns);
> - }
> + if (!err && !attached)
> + free_mnt_ns(ns);
> return err;
> }
>
> @@ -3869,7 +3859,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> const char __user *, put_old)
> {
> struct path new, old, root;
> - struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
> + struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent;
> struct mountpoint *old_mp, *root_mp;
> int error;
>
> @@ -3900,10 +3890,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> new_mnt = real_mount(new.mnt);
> root_mnt = real_mount(root.mnt);
> old_mnt = real_mount(old.mnt);
> - ex_parent = new_mnt->mnt_parent;
> root_parent = root_mnt->mnt_parent;
> if (IS_MNT_SHARED(old_mnt) ||
> - IS_MNT_SHARED(ex_parent) ||
> + IS_MNT_SHARED(new_mnt->mnt_parent) ||
> IS_MNT_SHARED(root_parent))
> goto out4;
> if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
> @@ -3942,7 +3931,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> attach_mnt(root_mnt, old_mnt, old_mp);
> /* mount new_root on / */
> attach_mnt(new_mnt, root_parent, root_mp);
> - mnt_add_count(root_parent, -1);
> touch_mnt_namespace(current->nsproxy->mnt_ns);
> /* A moved mount should not expire automatically */
> list_del_init(&new_mnt->mnt_expire);
> @@ -3952,8 +3940,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
> error = 0;
> out4:
> unlock_mount(old_mp);
> - if (!error)
> - mntput_no_expire(ex_parent);
> out3:
> path_put(&root);
> out2:
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 1106137c747a..e2c8a4b18857 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -368,7 +368,7 @@ static inline int do_refcount_check(struct mount *mnt, int count)
> */
> int propagate_mount_busy(struct mount *mnt, int refcnt)
> {
> - struct mount *m, *child, *topper;
> + struct mount *m, *child;
> struct mount *parent = mnt->mnt_parent;
>
> if (mnt == parent)
> @@ -384,7 +384,6 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
>
> for (m = propagation_next(parent, parent); m;
> m = propagation_next(m, parent)) {
> - int count = 1;
> child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
> if (!child)
> continue;
> @@ -392,13 +391,10 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
> /* Is there exactly one mount on the child that covers
> * it completely whose reference should be ignored?
> */
> - topper = find_topper(child);
> - if (topper)
> - count += 1;
> - else if (!list_empty(&child->mnt_mounts))
> + if (!find_topper(child) && !list_empty(&child->mnt_mounts))
> continue;
>
> - if (do_refcount_check(child, count))
> + if (do_refcount_check(child, 1))
> return 1;
> }
> return 0;

2022-07-20 07:54:16

by Ian Kent

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: track count of child mounts

On 20/7/22 10:17, Ian Kent wrote:
>
> On 20/7/22 09:50, Al Viro wrote:
>> On Mon, Jul 11, 2022 at 11:37:40AM +0800, Ian Kent wrote:
>>> While the total reference count of a mount is mostly all that's needed
>>> the reference count corresponding to the mounts only is occassionally
>>> also needed (for example, autofs checking if a tree of mounts can be
>>> expired).
>>>
>>> To make this reference count avaialble with minimal changes add a
>>> counter to track the number of child mounts under a given mount. This
>>> count can then be used to calculate the mounts only reference count.
>> No.  This is a wrong approach - instead of keeping track of number of
>> children, we should just stop having them contribute to refcount of
>> the parent.  Here's what I've got in my local tree; life gets simpler
>> that way.
>
> Right, I'll grab this and run some tests.
>
>
> Ian
>
>>
>> commit e99f1f9cc864103f326a5352e6ce1e377613437f
>> Author: Al Viro <[email protected]>
>> Date:   Sat Jul 9 14:45:39 2022 -0400
>>
>>      namespace: don't keep ->mnt_parent pinned
>>           makes refcounting more consistent
>>           Signed-off-by: Al Viro <[email protected]>
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 68789f896f08..53c29110a0cd 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -906,7 +906,6 @@ void mnt_set_mountpoint(struct mount *mnt,
>>               struct mount *child_mnt)
>>   {
>>       mp->m_count++;
>> -    mnt_add_count(mnt, 1);    /* essentially, that's mntget */
>>       child_mnt->mnt_mountpoint = mp->m_dentry;
>>       child_mnt->mnt_parent = mnt;
>>       child_mnt->mnt_mp = mp;
>> @@ -1429,22 +1428,18 @@ void mnt_cursor_del(struct mnt_namespace *ns,
>> struct mount *cursor)
>>   int may_umount_tree(struct vfsmount *m)
>>   {
>>       struct mount *mnt = real_mount(m);
>> -    int actual_refs = 0;
>> -    int minimum_refs = 0;
>> -    struct mount *p;
>>       BUG_ON(!m);
>>         /* write lock needed for mnt_get_count */
>>       lock_mount_hash();
>> -    for (p = mnt; p; p = next_mnt(p, mnt)) {
>> -        actual_refs += mnt_get_count(p);
>> -        minimum_refs += 2;
>> +    for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
>> +        int allowed = p == mnt ? 2 : 1;
>> +        if (mnt_get_count(p) > allowed) {
>> +            unlock_mount_hash();
>> +            return 0;
>> +        }
>>       }

One part of the problem I'm trying to fix is when some other
process has created a mount namespace (say with unshare(1))
on an auto-mounted path and holds the mount in use in some way.
This leads to an attempted umount in the automount daemon which
fails because a propagation of the mount is in use.


Given the way may_umount() behaves I thought it sensible that
may_umount_tree() behave that way too, but the above doesn't
check propagated mounts.


I'll see if I can come up with something on that ... unless
I'm missing something and you have different thoughts on how
this should be done ...


Ian

>>       unlock_mount_hash();
>> -
>> -    if (actual_refs > minimum_refs)
>> -        return 0;
>> -
>>       return 1;
>>   }
>>   @@ -1586,7 +1581,6 @@ static void umount_tree(struct mount *mnt,
>> enum umount_tree_flags how)
>>             disconnect = disconnect_mount(p, how);
>>           if (mnt_has_parent(p)) {
>> -            mnt_add_count(p->mnt_parent, -1);
>>               if (!disconnect) {
>>                   /* Don't forget about p */
>>                   list_add_tail(&p->mnt_child,
>> &p->mnt_parent->mnt_mounts);
>> @@ -2892,12 +2886,8 @@ static int do_move_mount(struct path
>> *old_path, struct path *new_path)
>>           put_mountpoint(old_mp);
>>   out:
>>       unlock_mount(mp);
>> -    if (!err) {
>> -        if (attached)
>> -            mntput_no_expire(parent);
>> -        else
>> -            free_mnt_ns(ns);
>> -    }
>> +    if (!err && !attached)
>> +        free_mnt_ns(ns);
>>       return err;
>>   }
>>   @@ -3869,7 +3859,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>           const char __user *, put_old)
>>   {
>>       struct path new, old, root;
>> -    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent,
>> *ex_parent;
>> +    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent;
>>       struct mountpoint *old_mp, *root_mp;
>>       int error;
>>   @@ -3900,10 +3890,9 @@ SYSCALL_DEFINE2(pivot_root, const char
>> __user *, new_root,
>>       new_mnt = real_mount(new.mnt);
>>       root_mnt = real_mount(root.mnt);
>>       old_mnt = real_mount(old.mnt);
>> -    ex_parent = new_mnt->mnt_parent;
>>       root_parent = root_mnt->mnt_parent;
>>       if (IS_MNT_SHARED(old_mnt) ||
>> -        IS_MNT_SHARED(ex_parent) ||
>> +        IS_MNT_SHARED(new_mnt->mnt_parent) ||
>>           IS_MNT_SHARED(root_parent))
>>           goto out4;
>>       if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
>> @@ -3942,7 +3931,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>       attach_mnt(root_mnt, old_mnt, old_mp);
>>       /* mount new_root on / */
>>       attach_mnt(new_mnt, root_parent, root_mp);
>> -    mnt_add_count(root_parent, -1);
>>       touch_mnt_namespace(current->nsproxy->mnt_ns);
>>       /* A moved mount should not expire automatically */
>>       list_del_init(&new_mnt->mnt_expire);
>> @@ -3952,8 +3940,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>       error = 0;
>>   out4:
>>       unlock_mount(old_mp);
>> -    if (!error)
>> -        mntput_no_expire(ex_parent);
>>   out3:
>>       path_put(&root);
>>   out2:
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 1106137c747a..e2c8a4b18857 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -368,7 +368,7 @@ static inline int do_refcount_check(struct mount
>> *mnt, int count)
>>    */
>>   int propagate_mount_busy(struct mount *mnt, int refcnt)
>>   {
>> -    struct mount *m, *child, *topper;
>> +    struct mount *m, *child;
>>       struct mount *parent = mnt->mnt_parent;
>>         if (mnt == parent)
>> @@ -384,7 +384,6 @@ int propagate_mount_busy(struct mount *mnt, int
>> refcnt)
>>         for (m = propagation_next(parent, parent); m;
>>                    m = propagation_next(m, parent)) {
>> -        int count = 1;
>>           child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>           if (!child)
>>               continue;
>> @@ -392,13 +391,10 @@ int propagate_mount_busy(struct mount *mnt, int
>> refcnt)
>>           /* Is there exactly one mount on the child that covers
>>            * it completely whose reference should be ignored?
>>            */
>> -        topper = find_topper(child);
>> -        if (topper)
>> -            count += 1;
>> -        else if (!list_empty(&child->mnt_mounts))
>> +        if (!find_topper(child) && !list_empty(&child->mnt_mounts))
>>               continue;
>>   -        if (do_refcount_check(child, count))
>> +        if (do_refcount_check(child, 1))
>>               return 1;
>>       }
>>       return 0;

2022-07-26 05:23:25

by Ian Kent

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: track count of child mounts

On 20/7/22 10:17, Ian Kent wrote:
>
> On 20/7/22 09:50, Al Viro wrote:
>> On Mon, Jul 11, 2022 at 11:37:40AM +0800, Ian Kent wrote:
>>> While the total reference count of a mount is mostly all that's needed
>>> the reference count corresponding to the mounts only is occassionally
>>> also needed (for example, autofs checking if a tree of mounts can be
>>> expired).
>>>
>>> To make this reference count avaialble with minimal changes add a
>>> counter to track the number of child mounts under a given mount. This
>>> count can then be used to calculate the mounts only reference count.
>> No.  This is a wrong approach - instead of keeping track of number of
>> children, we should just stop having them contribute to refcount of
>> the parent.  Here's what I've got in my local tree; life gets simpler
>> that way.
>
> Right, I'll grab this and run some tests.

Just a heads up, I've been able to reliably hang autofs with the
below patch using my submount test (which is actually pretty good
at exposing problems).


No idea what it is yet but I'll look around and keep trying to work
it out ;)


Ian

>
>
> Ian
>
>>
>> commit e99f1f9cc864103f326a5352e6ce1e377613437f
>> Author: Al Viro <[email protected]>
>> Date:   Sat Jul 9 14:45:39 2022 -0400
>>
>>      namespace: don't keep ->mnt_parent pinned
>>           makes refcounting more consistent
>>           Signed-off-by: Al Viro <[email protected]>
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 68789f896f08..53c29110a0cd 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -906,7 +906,6 @@ void mnt_set_mountpoint(struct mount *mnt,
>>               struct mount *child_mnt)
>>   {
>>       mp->m_count++;
>> -    mnt_add_count(mnt, 1);    /* essentially, that's mntget */
>>       child_mnt->mnt_mountpoint = mp->m_dentry;
>>       child_mnt->mnt_parent = mnt;
>>       child_mnt->mnt_mp = mp;
>> @@ -1429,22 +1428,18 @@ void mnt_cursor_del(struct mnt_namespace *ns,
>> struct mount *cursor)
>>   int may_umount_tree(struct vfsmount *m)
>>   {
>>       struct mount *mnt = real_mount(m);
>> -    int actual_refs = 0;
>> -    int minimum_refs = 0;
>> -    struct mount *p;
>>       BUG_ON(!m);
>>         /* write lock needed for mnt_get_count */
>>       lock_mount_hash();
>> -    for (p = mnt; p; p = next_mnt(p, mnt)) {
>> -        actual_refs += mnt_get_count(p);
>> -        minimum_refs += 2;
>> +    for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
>> +        int allowed = p == mnt ? 2 : 1;
>> +        if (mnt_get_count(p) > allowed) {
>> +            unlock_mount_hash();
>> +            return 0;
>> +        }
>>       }
>>       unlock_mount_hash();
>> -
>> -    if (actual_refs > minimum_refs)
>> -        return 0;
>> -
>>       return 1;
>>   }
>>   @@ -1586,7 +1581,6 @@ static void umount_tree(struct mount *mnt,
>> enum umount_tree_flags how)
>>             disconnect = disconnect_mount(p, how);
>>           if (mnt_has_parent(p)) {
>> -            mnt_add_count(p->mnt_parent, -1);
>>               if (!disconnect) {
>>                   /* Don't forget about p */
>>                   list_add_tail(&p->mnt_child,
>> &p->mnt_parent->mnt_mounts);
>> @@ -2892,12 +2886,8 @@ static int do_move_mount(struct path
>> *old_path, struct path *new_path)
>>           put_mountpoint(old_mp);
>>   out:
>>       unlock_mount(mp);
>> -    if (!err) {
>> -        if (attached)
>> -            mntput_no_expire(parent);
>> -        else
>> -            free_mnt_ns(ns);
>> -    }
>> +    if (!err && !attached)
>> +        free_mnt_ns(ns);
>>       return err;
>>   }
>>   @@ -3869,7 +3859,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>           const char __user *, put_old)
>>   {
>>       struct path new, old, root;
>> -    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent,
>> *ex_parent;
>> +    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent;
>>       struct mountpoint *old_mp, *root_mp;
>>       int error;
>>   @@ -3900,10 +3890,9 @@ SYSCALL_DEFINE2(pivot_root, const char
>> __user *, new_root,
>>       new_mnt = real_mount(new.mnt);
>>       root_mnt = real_mount(root.mnt);
>>       old_mnt = real_mount(old.mnt);
>> -    ex_parent = new_mnt->mnt_parent;
>>       root_parent = root_mnt->mnt_parent;
>>       if (IS_MNT_SHARED(old_mnt) ||
>> -        IS_MNT_SHARED(ex_parent) ||
>> +        IS_MNT_SHARED(new_mnt->mnt_parent) ||
>>           IS_MNT_SHARED(root_parent))
>>           goto out4;
>>       if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
>> @@ -3942,7 +3931,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>       attach_mnt(root_mnt, old_mnt, old_mp);
>>       /* mount new_root on / */
>>       attach_mnt(new_mnt, root_parent, root_mp);
>> -    mnt_add_count(root_parent, -1);
>>       touch_mnt_namespace(current->nsproxy->mnt_ns);
>>       /* A moved mount should not expire automatically */
>>       list_del_init(&new_mnt->mnt_expire);
>> @@ -3952,8 +3940,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>> *, new_root,
>>       error = 0;
>>   out4:
>>       unlock_mount(old_mp);
>> -    if (!error)
>> -        mntput_no_expire(ex_parent);
>>   out3:
>>       path_put(&root);
>>   out2:
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 1106137c747a..e2c8a4b18857 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -368,7 +368,7 @@ static inline int do_refcount_check(struct mount
>> *mnt, int count)
>>    */
>>   int propagate_mount_busy(struct mount *mnt, int refcnt)
>>   {
>> -    struct mount *m, *child, *topper;
>> +    struct mount *m, *child;
>>       struct mount *parent = mnt->mnt_parent;
>>         if (mnt == parent)
>> @@ -384,7 +384,6 @@ int propagate_mount_busy(struct mount *mnt, int
>> refcnt)
>>         for (m = propagation_next(parent, parent); m;
>>                    m = propagation_next(m, parent)) {
>> -        int count = 1;
>>           child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>           if (!child)
>>               continue;
>> @@ -392,13 +391,10 @@ int propagate_mount_busy(struct mount *mnt, int
>> refcnt)
>>           /* Is there exactly one mount on the child that covers
>>            * it completely whose reference should be ignored?
>>            */
>> -        topper = find_topper(child);
>> -        if (topper)
>> -            count += 1;
>> -        else if (!list_empty(&child->mnt_mounts))
>> +        if (!find_topper(child) && !list_empty(&child->mnt_mounts))
>>               continue;
>>   -        if (do_refcount_check(child, count))
>> +        if (do_refcount_check(child, 1))
>>               return 1;
>>       }
>>       return 0;

2022-07-26 07:20:43

by Ian Kent

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: track count of child mounts


On 26/7/22 13:11, Ian Kent wrote:
> On 20/7/22 10:17, Ian Kent wrote:
>>
>> On 20/7/22 09:50, Al Viro wrote:
>>> On Mon, Jul 11, 2022 at 11:37:40AM +0800, Ian Kent wrote:
>>>> While the total reference count of a mount is mostly all that's needed
>>>> the reference count corresponding to the mounts only is occassionally
>>>> also needed (for example, autofs checking if a tree of mounts can be
>>>> expired).
>>>>
>>>> To make this reference count avaialble with minimal changes add a
>>>> counter to track the number of child mounts under a given mount. This
>>>> count can then be used to calculate the mounts only reference count.
>>> No.  This is a wrong approach - instead of keeping track of number of
>>> children, we should just stop having them contribute to refcount of
>>> the parent.  Here's what I've got in my local tree; life gets simpler
>>> that way.
>>
>> Right, I'll grab this and run some tests.
>
> Just a heads up, I've been able to reliably hang autofs with the
>
> below patch using my submount test (which is actually pretty good
>
> at exposing problems).
>
>
> No idea what it is yet but I'll look around and keep trying to work
>
> it out, ;)

Mmm ... so it's just slower ... that was unexpected ...


>
>
> Ian
>
>>
>>
>> Ian
>>
>>>
>>> commit e99f1f9cc864103f326a5352e6ce1e377613437f
>>> Author: Al Viro <[email protected]>
>>> Date:   Sat Jul 9 14:45:39 2022 -0400
>>>
>>>      namespace: don't keep ->mnt_parent pinned
>>>           makes refcounting more consistent
>>>           Signed-off-by: Al Viro <[email protected]>
>>>
>>> diff --git a/fs/namespace.c b/fs/namespace.c
>>> index 68789f896f08..53c29110a0cd 100644
>>> --- a/fs/namespace.c
>>> +++ b/fs/namespace.c
>>> @@ -906,7 +906,6 @@ void mnt_set_mountpoint(struct mount *mnt,
>>>               struct mount *child_mnt)
>>>   {
>>>       mp->m_count++;
>>> -    mnt_add_count(mnt, 1);    /* essentially, that's mntget */
>>>       child_mnt->mnt_mountpoint = mp->m_dentry;
>>>       child_mnt->mnt_parent = mnt;
>>>       child_mnt->mnt_mp = mp;
>>> @@ -1429,22 +1428,18 @@ void mnt_cursor_del(struct mnt_namespace
>>> *ns, struct mount *cursor)
>>>   int may_umount_tree(struct vfsmount *m)
>>>   {
>>>       struct mount *mnt = real_mount(m);
>>> -    int actual_refs = 0;
>>> -    int minimum_refs = 0;
>>> -    struct mount *p;
>>>       BUG_ON(!m);
>>>         /* write lock needed for mnt_get_count */
>>>       lock_mount_hash();
>>> -    for (p = mnt; p; p = next_mnt(p, mnt)) {
>>> -        actual_refs += mnt_get_count(p);
>>> -        minimum_refs += 2;
>>> +    for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
>>> +        int allowed = p == mnt ? 2 : 1;
>>> +        if (mnt_get_count(p) > allowed) {
>>> +            unlock_mount_hash();
>>> +            return 0;
>>> +        }
>>>       }
>>>       unlock_mount_hash();
>>> -
>>> -    if (actual_refs > minimum_refs)
>>> -        return 0;
>>> -
>>>       return 1;
>>>   }
>>>   @@ -1586,7 +1581,6 @@ static void umount_tree(struct mount *mnt,
>>> enum umount_tree_flags how)
>>>             disconnect = disconnect_mount(p, how);
>>>           if (mnt_has_parent(p)) {
>>> -            mnt_add_count(p->mnt_parent, -1);
>>>               if (!disconnect) {
>>>                   /* Don't forget about p */
>>>                   list_add_tail(&p->mnt_child,
>>> &p->mnt_parent->mnt_mounts);
>>> @@ -2892,12 +2886,8 @@ static int do_move_mount(struct path
>>> *old_path, struct path *new_path)
>>>           put_mountpoint(old_mp);
>>>   out:
>>>       unlock_mount(mp);
>>> -    if (!err) {
>>> -        if (attached)
>>> -            mntput_no_expire(parent);
>>> -        else
>>> -            free_mnt_ns(ns);
>>> -    }
>>> +    if (!err && !attached)
>>> +        free_mnt_ns(ns);
>>>       return err;
>>>   }
>>>   @@ -3869,7 +3859,7 @@ SYSCALL_DEFINE2(pivot_root, const char
>>> __user *, new_root,
>>>           const char __user *, put_old)
>>>   {
>>>       struct path new, old, root;
>>> -    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent,
>>> *ex_parent;
>>> +    struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent;
>>>       struct mountpoint *old_mp, *root_mp;
>>>       int error;
>>>   @@ -3900,10 +3890,9 @@ SYSCALL_DEFINE2(pivot_root, const char
>>> __user *, new_root,
>>>       new_mnt = real_mount(new.mnt);
>>>       root_mnt = real_mount(root.mnt);
>>>       old_mnt = real_mount(old.mnt);
>>> -    ex_parent = new_mnt->mnt_parent;
>>>       root_parent = root_mnt->mnt_parent;
>>>       if (IS_MNT_SHARED(old_mnt) ||
>>> -        IS_MNT_SHARED(ex_parent) ||
>>> +        IS_MNT_SHARED(new_mnt->mnt_parent) ||
>>>           IS_MNT_SHARED(root_parent))
>>>           goto out4;
>>>       if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
>>> @@ -3942,7 +3931,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>>> *, new_root,
>>>       attach_mnt(root_mnt, old_mnt, old_mp);
>>>       /* mount new_root on / */
>>>       attach_mnt(new_mnt, root_parent, root_mp);
>>> -    mnt_add_count(root_parent, -1);
>>>       touch_mnt_namespace(current->nsproxy->mnt_ns);
>>>       /* A moved mount should not expire automatically */
>>>       list_del_init(&new_mnt->mnt_expire);
>>> @@ -3952,8 +3940,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user
>>> *, new_root,
>>>       error = 0;
>>>   out4:
>>>       unlock_mount(old_mp);
>>> -    if (!error)
>>> -        mntput_no_expire(ex_parent);
>>>   out3:
>>>       path_put(&root);
>>>   out2:
>>> diff --git a/fs/pnode.c b/fs/pnode.c
>>> index 1106137c747a..e2c8a4b18857 100644
>>> --- a/fs/pnode.c
>>> +++ b/fs/pnode.c
>>> @@ -368,7 +368,7 @@ static inline int do_refcount_check(struct mount
>>> *mnt, int count)
>>>    */
>>>   int propagate_mount_busy(struct mount *mnt, int refcnt)
>>>   {
>>> -    struct mount *m, *child, *topper;
>>> +    struct mount *m, *child;
>>>       struct mount *parent = mnt->mnt_parent;
>>>         if (mnt == parent)
>>> @@ -384,7 +384,6 @@ int propagate_mount_busy(struct mount *mnt, int
>>> refcnt)
>>>         for (m = propagation_next(parent, parent); m;
>>>                    m = propagation_next(m, parent)) {
>>> -        int count = 1;
>>>           child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
>>>           if (!child)
>>>               continue;
>>> @@ -392,13 +391,10 @@ int propagate_mount_busy(struct mount *mnt,
>>> int refcnt)
>>>           /* Is there exactly one mount on the child that covers
>>>            * it completely whose reference should be ignored?
>>>            */
>>> -        topper = find_topper(child);
>>> -        if (topper)
>>> -            count += 1;
>>> -        else if (!list_empty(&child->mnt_mounts))
>>> +        if (!find_topper(child) && !list_empty(&child->mnt_mounts))
>>>               continue;
>>>   -        if (do_refcount_check(child, count))
>>> +        if (do_refcount_check(child, 1))
>>>               return 1;
>>>       }
>>>       return 0;