2020-09-04 16:02:55

by Christian Göttsche

[permalink] [raw]
Subject: [RFC PATCH] sched: only issue an audit on privileged operation

sched_setattr(2) does via kernel/sched/core.c:__sched_setscheduler()
issue a CAP_SYS_NICE audit event unconditionally, even when the requested
operation does not require that capability / is un-privileged.

Perform privilged/unprivileged catigorization first and perform a
capable test only if needed.

Signed-off-by: Christian Göttsche <[email protected]>
---
kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++-------------
1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8471a0f7eb32..954f968d2466 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5249,13 +5249,19 @@ static int __sched_setscheduler(struct task_struct *p,
return -EINVAL;

/*
- * Allow unprivileged RT tasks to decrease priority:
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed to avoid audit
+ * event on non-privileged operations:
*/
- if (user && !capable(CAP_SYS_NICE)) {
+ if (user) {
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
+ !can_nice(p, attr->sched_nice)) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }
}

if (rt_policy(policy)) {
@@ -5263,13 +5269,21 @@ static int __sched_setscheduler(struct task_struct *p,
task_rlimit(p, RLIMIT_RTPRIO);

/* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
+ if (policy != p->policy && !rlim_rtprio) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }

/* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
+ attr->sched_priority > rlim_rtprio) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }
}

/*
@@ -5278,28 +5292,43 @@ static int __sched_setscheduler(struct task_struct *p,
* unprivileged DL tasks to increase their relative deadline
* or reduce their runtime (both ways reducing utilization)
*/
- if (dl_policy(policy))
- return -EPERM;
+ if (dl_policy(policy)) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }

/*
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
+ if (!can_nice(p, task_nice(p))) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }
}

/* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
+ if (!check_same_owner(p)) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }

/* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
+ if (p->sched_reset_on_fork && !reset_on_fork) {
+ if (capable(CAP_SYS_NICE))
+ goto sys_nice_capable;
+ else
+ return -EPERM;
+ }

- if (user) {
+sys_nice_capable:
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;

--
2.28.0


2020-09-08 10:27:47

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC PATCH] sched: only issue an audit on privileged operation

On Fri, Sep 04, 2020 at 06:00:31PM +0200, Christian G?ttsche wrote:
> sched_setattr(2) does via kernel/sched/core.c:__sched_setscheduler()
> issue a CAP_SYS_NICE audit event unconditionally, even when the requested
> operation does not require that capability / is un-privileged.
>
> Perform privilged/unprivileged catigorization first and perform a
> capable test only if needed.
>
> Signed-off-by: Christian G?ttsche <[email protected]>
> ---
> kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++-------------
> 1 file changed, 47 insertions(+), 18 deletions(-)

So who sodding cares about audit, and why is that a reason to make a
trainwreck of code?

2020-09-08 11:37:53

by Ondrej Mosnacek

[permalink] [raw]
Subject: Re: [RFC PATCH] sched: only issue an audit on privileged operation

On Tue, Sep 8, 2020 at 12:26 PM <[email protected]> wrote:
> On Fri, Sep 04, 2020 at 06:00:31PM +0200, Christian Göttsche wrote:
> > sched_setattr(2) does via kernel/sched/core.c:__sched_setscheduler()
> > issue a CAP_SYS_NICE audit event unconditionally, even when the requested
> > operation does not require that capability / is un-privileged.
> >
> > Perform privilged/unprivileged catigorization first and perform a
> > capable test only if needed.
> >
> > Signed-off-by: Christian Göttsche <[email protected]>
> > ---
> > kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++-------------
> > 1 file changed, 47 insertions(+), 18 deletions(-)
>
> So who sodding cares about audit, and why is that a reason to make a
> trainwreck of code?

The commit message should be more specific. I believe Christian is
talking about the case where SELinux or other LSM denies the
capability, in which case the denial is usually logged to the audit
log. Obviously, we don't want to get a denial logged when the
capability wasn't actually required for the operation to be allowed.

Unfortunately, the capability interface doesn't provide a way to first
get the decision value and only trigger the auditing when it was
actually used in the decision, so in complex scenarios like this the
caller needs to jump through some hoops to avoid such false-positive
denial records.

--
Ondrej Mosnacek
Software Engineer, Platform Security - SELinux kernel
Red Hat, Inc.

2022-05-03 00:35:52

by Christian Göttsche

[permalink] [raw]
Subject: [PATCH v2] [RFC PATCH] sched: only perform capability check on privileged operation

sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
a CAP_SYS_NICE audit event unconditionally, even when the requested
operation does not require that capability / is unprivileged, i.e. for
reducing niceness.
This is relevant in connection with SELinux, where a capability check
results in a policy decision and by default a denial message on
insufficient permission is issued.
It can lead to three undesired cases:
1. A denial message is generated, even in case the operation was an
unprivileged one and thus the syscall succeeded, creating noise.
2. To avoid the noise from 1. the policy writer adds a rule to ignore
those denial messages, hiding future syscalls, where the task
performs an actual privileged operation, leading to hidden limited
functionality of that task.
3. To avoid the noise from 1. the policy writer adds a rule to allow
the task the capability CAP_SYS_NICE, while it does not need it,
violating the principle of least privilege.

Conduct privilged/unprivileged categorization first and perform a
capable test (and at most once) only if needed.

Signed-off-by: Christian Göttsche <[email protected]>

---
v2:
add is_nice_reduction() to avoid duplicate capable(CAP_SYS_NICE)
checks via can_nice()
---
kernel/sched/core.c | 135 +++++++++++++++++++++++++++-----------------
1 file changed, 84 insertions(+), 51 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..b9c1e67af46f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7211,6 +7211,86 @@ static bool check_same_owner(struct task_struct *p)
return match;
}

+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7252,58 +7332,11 @@ static int __sched_setscheduler(struct task_struct *p,
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;

- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;

--
2.36.0

2022-05-04 19:36:11

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] [RFC PATCH] sched: only perform capability check on privileged operation

On Mon, May 02, 2022 at 05:24:14PM +0200, Christian G?ttsche wrote:
> sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
> a CAP_SYS_NICE audit event unconditionally, even when the requested
> operation does not require that capability / is unprivileged, i.e. for
> reducing niceness.
> This is relevant in connection with SELinux, where a capability check
> results in a policy decision and by default a denial message on
> insufficient permission is issued.
> It can lead to three undesired cases:
> 1. A denial message is generated, even in case the operation was an
> unprivileged one and thus the syscall succeeded, creating noise.
> 2. To avoid the noise from 1. the policy writer adds a rule to ignore
> those denial messages, hiding future syscalls, where the task
> performs an actual privileged operation, leading to hidden limited
> functionality of that task.
> 3. To avoid the noise from 1. the policy writer adds a rule to allow
> the task the capability CAP_SYS_NICE, while it does not need it,
> violating the principle of least privilege.
>
> Conduct privilged/unprivileged categorization first and perform a
> capable test (and at most once) only if needed.
>
> Signed-off-by: Christian G?ttsche <[email protected]>

Does something like so on top work?

---
kernel/sched/core.c | 41 +++++++++++++++++------------------------
1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ba5a9a1ce1e5..d3b5a2757c5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6931,17 +6931,27 @@ void set_user_nice(struct task_struct *p, long nice)
EXPORT_SYMBOL(set_user_nice);

/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);

- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}

#ifdef __ARCH_WANT_SYS_NICE
@@ -7220,22 +7230,6 @@ static bool check_same_owner(struct task_struct *p)
return match;
}

-/*
- * is_nice_reduction - check if nice value is an actual reduction
- *
- * Similar to can_nice() but does not perform a capability check.
- *
- * @p: task
- * @nice: nice value
- */
-static bool is_nice_reduction(const struct task_struct *p, const int nice)
-{
- /* Convert nice value [19,-20] to rlimit style value [1,40]: */
- int nice_rlim = nice_to_rlimit(nice);
-
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
-}
-
/*
* Allow unprivileged RT tasks to decrease priority.
* Only issue a capable test if needed and only once to avoid an audit
@@ -7247,13 +7241,12 @@ static int user_check_sched_setscheduler(struct task_struct *p,
{
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
- !is_nice_reduction(p, attr->sched_nice))
+ !is_nice_reduction(p, attr->sched_nice))
goto req_priv;
}

if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);

/* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
@@ -7261,7 +7254,7 @@ static int user_check_sched_setscheduler(struct task_struct *p,

/* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
+ attr->sched_priority > rlim_rtprio)
goto req_priv;
}


2022-06-15 15:33:10

by Christian Göttsche

[permalink] [raw]
Subject: [PATCH v3] [RFC PATCH] sched: only perform capability check on privileged operation

sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
a CAP_SYS_NICE audit event unconditionally, even when the requested
operation does not require that capability / is unprivileged, i.e. for
reducing niceness.
This is relevant in connection with SELinux, where a capability check
results in a policy decision and by default a denial message on
insufficient permission is issued.
It can lead to three undesired cases:
1. A denial message is generated, even in case the operation was an
unprivileged one and thus the syscall succeeded, creating noise.
2. To avoid the noise from 1. the policy writer adds a rule to ignore
those denial messages, hiding future syscalls, where the task
performs an actual privileged operation, leading to hidden limited
functionality of that task.
3. To avoid the noise from 1. the policy writer adds a rule to allow
the task the capability CAP_SYS_NICE, while it does not need it,
violating the principle of least privilege.

Conduct privilged/unprivileged categorization first and perform a
capable test (and at most once) only if needed.

Signed-off-by: Christian Göttsche <[email protected]>
---
v3:
incorporate feedback from Peter Zijlstra
- use is_nice_reduction() in can_nice()
- adjust indentation
v2:
add is_nice_reduction() to avoid duplicate capable(CAP_SYS_NICE)
checks via can_nice()
---
kernel/sched/core.c | 138 ++++++++++++++++++++++++++------------------
1 file changed, 83 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfa7452ca92e..c3647db8872d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6968,17 +6968,29 @@ void set_user_nice(struct task_struct *p, long nice)
EXPORT_SYMBOL(set_user_nice);

/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);

- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}

#ifdef __ARCH_WANT_SYS_NICE
@@ -7257,6 +7269,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}

+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7298,58 +7373,11 @@ static int __sched_setscheduler(struct task_struct *p,
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;

- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;

--
2.36.1

Subject: [tip: sched/core] sched: only perform capability check on privileged operation

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 700a78335fc28a59c307f420857fd2d4521549f8
Gitweb: https://git.kernel.org/tip/700a78335fc28a59c307f420857fd2d4521549f8
Author: Christian Göttsche <[email protected]>
AuthorDate: Wed, 15 Jun 2022 17:25:04 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 28 Jun 2022 09:08:29 +02:00

sched: only perform capability check on privileged operation

sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
a CAP_SYS_NICE audit event unconditionally, even when the requested
operation does not require that capability / is unprivileged, i.e. for
reducing niceness.
This is relevant in connection with SELinux, where a capability check
results in a policy decision and by default a denial message on
insufficient permission is issued.
It can lead to three undesired cases:
1. A denial message is generated, even in case the operation was an
unprivileged one and thus the syscall succeeded, creating noise.
2. To avoid the noise from 1. the policy writer adds a rule to ignore
those denial messages, hiding future syscalls, where the task
performs an actual privileged operation, leading to hidden limited
functionality of that task.
3. To avoid the noise from 1. the policy writer adds a rule to allow
the task the capability CAP_SYS_NICE, while it does not need it,
violating the principle of least privilege.

Conduct privilged/unprivileged categorization first and perform a
capable test (and at most once) only if needed.

Signed-off-by: Christian Göttsche <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 138 +++++++++++++++++++++++++------------------
1 file changed, 83 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7234526..d3e2c5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6974,17 +6974,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);

/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);

- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}

#ifdef __ARCH_WANT_SYS_NICE
@@ -7263,6 +7275,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}

+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7304,58 +7379,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;

- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;