The pid_revalidate() function requires dropping from RCU into REF lookup
mode. When many threads are resolving paths within /proc in parallel,
this can result in heavy spinlock contention as each thread tries to
grab a reference to the /proc dentry lock (and drop it shortly
thereafter).
Allow the pid_revalidate() function to execute under LOOKUP_RCU. When
updates must be made to the inode due to the owning task performing
setuid(), drop out of RCU and into REF mode. Remove the call to
security_task_to_inode(), since we can rely on the call from
proc_pid_make_inode().
Signed-off-by: Stephen Brennan <[email protected]>
---
I'd like to use this patch as an RFC on this approach for reducing spinlock
contention during many parallel path lookups in the /proc filesystem. The
contention can be triggered by, for example, running ~100 parallel instances of
"TZ=/etc/localtime ps -fe >/dev/null" on a 100CPU machine. The %sys utilization
in such a case reaches around 90%, and profiles show two code paths with high
utilization:
walk_component
lookup_fast
unlazy_child
legitimize_root
__legitimize_path
lockref_get_not_dead
terminate_walk
dput
dput
By applying this patch, %sys utilization falls to around 60% under the same
workload. Although this particular workload is a bit contrived, we have seen
some monitoring scripts which produced high %sys time due to this contention.
Changes from v2:
- Remove get_pid_task_rcu_user() and get_proc_task_rcu(), since they were
unnecessary.
- Remove the call to security_task_to_inode().
fs/proc/base.c | 47 +++++++++++++++++++++++++++++++++--------------
1 file changed, 33 insertions(+), 14 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea9501afb8..833d55a59e20 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,12 +1813,28 @@ int pid_getattr(const struct path *path, struct kstat *stat,
/*
* Set <pid>/... inode ownership (can change due to setuid(), etc.)
*/
-void pid_update_inode(struct task_struct *task, struct inode *inode)
+static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
+ unsigned int flags)
{
- task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+ kuid_t uid;
+ kgid_t gid;
+
+ task_dump_owner(task, inode->i_mode, &uid, &gid);
+ if (uid_eq(uid, inode->i_uid) && gid_eq(gid, inode->i_gid) &&
+ !(inode->i_mode & (S_ISUID | S_ISGID)))
+ return 1;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
inode->i_mode &= ~(S_ISUID | S_ISGID);
- security_task_to_inode(task, inode);
+ return 1;
+}
+
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+ do_pid_update_inode(task, inode, 0);
}
/*
@@ -1830,19 +1846,22 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
struct task_struct *task;
+ int rv = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- inode = d_inode(dentry);
- task = get_proc_task(inode);
-
- if (task) {
- pid_update_inode(task, inode);
- put_task_struct(task);
- return 1;
+ if (flags & LOOKUP_RCU) {
+ inode = d_inode_rcu(dentry);
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task)
+ rv = do_pid_update_inode(task, inode, flags);
+ } else {
+ inode = d_inode(dentry);
+ task = get_proc_task(inode);
+ if (task) {
+ rv = do_pid_update_inode(task, inode, flags);
+ put_task_struct(task);
+ }
}
- return 0;
+ return rv;
}
static inline bool proc_inode_is_dead(struct inode *inode)
--
2.25.1
On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
> -void pid_update_inode(struct task_struct *task, struct inode *inode)
> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
> + unsigned int flags)
I'm really nitpicking here, but this function only _updates_ the inode
if flags says it should. So I was thinking something like this
(compile tested only).
I'd really appreocate feedback from someone like Casey or Stephen on
what they need for their security modules.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b362523a9829..771f330bfce7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1968,6 +1968,25 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
security_task_to_inode(task, inode);
}
+/* See if we can avoid the above call. Assumes RCU lock held */
+static bool inode_needs_pid_update(struct task_struct *task,
+ const struct inode *inode)
+{
+ kuid_t uid;
+ kgid_t gid;
+
+ if (inode->i_mode & (S_ISUID | S_ISGID))
+ return true;
+ task_dump_owner(task, inode->i_mode, &uid, &gid);
+ if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
+ return true;
+ /*
+ * XXX: Do we need to call the security system here to see if
+ * there's a pending update?
+ */
+ return false;
+}
+
/*
* Rewrite the inode's ownerships here because the owning task may have
* performed a setuid(), etc.
@@ -1978,8 +1997,15 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
struct inode *inode;
struct task_struct *task;
- if (flags & LOOKUP_RCU)
+ if (flags & LOOKUP_RCU) {
+ inode = d_inode_rcu(dentry);
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!task)
+ return 0;
+ if (!inode_needs_pid_update(task, inode))
+ return 1;
return -ECHILD;
+ }
inode = d_inode(dentry);
task = get_proc_task(inode);
Stephen Brennan <[email protected]> writes:
> The pid_revalidate() function requires dropping from RCU into REF lookup
> mode. When many threads are resolving paths within /proc in parallel,
> this can result in heavy spinlock contention as each thread tries to
> grab a reference to the /proc dentry lock (and drop it shortly
> thereafter).
I am feeling dense at the moment. Which lock specifically are you
referring to? The only locks I can thinking of are sleeping locks,
not spinlocks.
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index ebea9501afb8..833d55a59e20 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1830,19 +1846,22 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
> {
> struct inode *inode;
> struct task_struct *task;
> + int rv = 0;
>
> - if (flags & LOOKUP_RCU)
> - return -ECHILD;
> -
> - inode = d_inode(dentry);
> - task = get_proc_task(inode);
> -
> - if (task) {
> - pid_update_inode(task, inode);
> - put_task_struct(task);
> - return 1;
> + if (flags & LOOKUP_RCU) {
Why do we need to test flags here at all?
Why can't the code simply take an rcu_read_lock unconditionally and just
pass flags into do_pid_update_inode?
> + inode = d_inode_rcu(dentry);
> + task = pid_task(proc_pid(inode), PIDTYPE_PID);
> + if (task)
> + rv = do_pid_update_inode(task, inode, flags);
> + } else {
> + inode = d_inode(dentry);
> + task = get_proc_task(inode);
> + if (task) {
> + rv = do_pid_update_inode(task, inode, flags);
> + put_task_struct(task);
> + }
> }
> - return 0;
> + return rv;
> }
>
> static inline bool proc_inode_is_dead(struct inode *inode)
Eric
Matthew Wilcox <[email protected]> writes:
> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>> + unsigned int flags)
>
> I'm really nitpicking here, but this function only _updates_ the inode
> if flags says it should. So I was thinking something like this
> (compile tested only).
>
> I'd really appreocate feedback from someone like Casey or Stephen on
> what they need for their security modules.
Just so we don't have security module questions confusing things
can we please make this a 2 patch series? With the first
patch removing security_task_to_inode?
The justification for the removal is that all security_task_to_inode
appears to care about is the file type bits in inode->i_mode. Something
that never changes. Having this in a separate patch would make that
logical change easier to verify.
Eric
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b362523a9829..771f330bfce7 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1968,6 +1968,25 @@ void pid_update_inode(struct task_struct *task, struct inode *inode)
> security_task_to_inode(task, inode);
> }
>
> +/* See if we can avoid the above call. Assumes RCU lock held */
> +static bool inode_needs_pid_update(struct task_struct *task,
> + const struct inode *inode)
> +{
> + kuid_t uid;
> + kgid_t gid;
> +
> + if (inode->i_mode & (S_ISUID | S_ISGID))
> + return true;
> + task_dump_owner(task, inode->i_mode, &uid, &gid);
> + if (!uid_eq(uid, inode->i_uid) || !gid_eq(gid, inode->i_gid))
> + return true;
> + /*
> + * XXX: Do we need to call the security system here to see if
> + * there's a pending update?
> + */
> + return false;
> +}
> +
> /*
> * Rewrite the inode's ownerships here because the owning task may have
> * performed a setuid(), etc.
> @@ -1978,8 +1997,15 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
> struct inode *inode;
> struct task_struct *task;
>
> - if (flags & LOOKUP_RCU)
> + if (flags & LOOKUP_RCU) {
> + inode = d_inode_rcu(dentry);
> + task = pid_task(proc_pid(inode), PIDTYPE_PID);
> + if (!task)
> + return 0;
> + if (!inode_needs_pid_update(task, inode))
> + return 1;
> return -ECHILD;
> + }
>
> inode = d_inode(dentry);
> task = get_proc_task(inode);
On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
> Matthew Wilcox <[email protected]> writes:
>
> > On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
> >> -void pid_update_inode(struct task_struct *task, struct inode *inode)
> >> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
> >> + unsigned int flags)
> >
> > I'm really nitpicking here, but this function only _updates_ the inode
> > if flags says it should. So I was thinking something like this
> > (compile tested only).
> >
> > I'd really appreocate feedback from someone like Casey or Stephen on
> > what they need for their security modules.
>
> Just so we don't have security module questions confusing things
> can we please make this a 2 patch series? With the first
> patch removing security_task_to_inode?
>
> The justification for the removal is that all security_task_to_inode
> appears to care about is the file type bits in inode->i_mode. Something
> that never changes. Having this in a separate patch would make that
> logical change easier to verify.
I don't think that's right, which is why I keep asking Stephen & Casey
for their thoughts. For example,
* Sets the smack pointer in the inode security blob
*/
static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
{
struct inode_smack *isp = smack_inode(inode);
struct smack_known *skp = smk_of_task_struct(p);
isp->smk_inode = skp;
isp->smk_flags |= SMK_INODE_INSTANT;
}
That seems to do rather more than checking the file type bits.
On Sun, Dec 13, 2020 at 08:30:40AM -0600, Eric W. Biederman wrote:
> Stephen Brennan <[email protected]> writes:
>
> > The pid_revalidate() function requires dropping from RCU into REF lookup
> > mode. When many threads are resolving paths within /proc in parallel,
> > this can result in heavy spinlock contention as each thread tries to
> > grab a reference to the /proc dentry lock (and drop it shortly
> > thereafter).
>
> I am feeling dense at the moment. Which lock specifically are you
> referring to? The only locks I can thinking of are sleeping locks,
> not spinlocks.
Stephen may have a better answer than this, but our mutex implementation
spins if the owner is still running, so he may have misspoken slightly.
He's testing on a giant system with hundreds of CPUs, so a mutex is
going to behave like a spinlock for him.
> Why do we need to test flags here at all?
> Why can't the code simply take an rcu_read_lock unconditionally and just
> pass flags into do_pid_update_inode?
Hah! I was thinking about that possibility this morning, and I was
going to ask you that question.
On 12/13/2020 3:00 PM, Paul Moore wrote:
> On Sun, Dec 13, 2020 at 11:30 AM Matthew Wilcox <[email protected]> wrote:
>> On Sun, Dec 13, 2020 at 08:22:32AM -0600, Eric W. Biederman wrote:
>>> Matthew Wilcox <[email protected]> writes:
>>>
>>>> On Thu, Dec 03, 2020 at 04:02:12PM -0800, Stephen Brennan wrote:
>>>>> -void pid_update_inode(struct task_struct *task, struct inode *inode)
>>>>> +static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
>>>>> + unsigned int flags)
>>>> I'm really nitpicking here, but this function only _updates_ the inode
>>>> if flags says it should. So I was thinking something like this
>>>> (compile tested only).
>>>>
>>>> I'd really appreocate feedback from someone like Casey or Stephen on
>>>> what they need for their security modules.
>>> Just so we don't have security module questions confusing things
>>> can we please make this a 2 patch series? With the first
>>> patch removing security_task_to_inode?
>>>
>>> The justification for the removal is that all security_task_to_inode
>>> appears to care about is the file type bits in inode->i_mode. Something
>>> that never changes. Having this in a separate patch would make that
>>> logical change easier to verify.
>> I don't think that's right, which is why I keep asking Stephen & Casey
>> for their thoughts.
> The SELinux security_task_to_inode() implementation only cares about
> inode->i_mode S_IFMT bits from the inode so that we can set the object
> class correctly. The inode's SELinux label is taken from the
> associated task.
>
> Casey would need to comment on Smack's needs.
SELinux uses different "class"es on subjects and objects.
Smack does not differentiate, so knows the label it wants
the inode to have when smack_task_to_inode() is called,
and sets it accordingly. Nothing is allocated in the process,
and the new value is coming from the Smack master label list.
It isn't going to go away. It appears that this is the point
of the hook. Am I missing something?
>
>> For example,
>>
>> * Sets the smack pointer in the inode security blob
>> */
>> static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
>> {
>> struct inode_smack *isp = smack_inode(inode);
>> struct smack_known *skp = smk_of_task_struct(p);
>>
>> isp->smk_inode = skp;
>> isp->smk_flags |= SMK_INODE_INSTANT;
>> }
>>
>> That seems to do rather more than checking the file type bits.