2020-04-20 02:46:06

by changhuaixin

Subject: [PATCH 0/2] Two small fixes for bandwidth controller

This series contains two small fixes for the bandwidth controller, as well
as preparation for the burstable CFS bandwidth controller.

Huaixin Chang (2):
sched: Defend cfs and rt bandwidth quota against overflow
sched/fair: Refill bandwidth before scaling

kernel/sched/core.c | 8 ++++++++
kernel/sched/fair.c | 4 ++--
kernel/sched/rt.c | 9 +++++++++
kernel/sched/sched.h | 2 ++
4 files changed, 21 insertions(+), 2 deletions(-)

--
2.14.4.44.g2045bb6


2020-04-20 02:46:45

by changhuaixin

Subject: [PATCH 2/2] sched/fair: Refill bandwidth before scaling

In order to prevent a possible hardlockup in the sched_cfs_period_timer()
loop, a loop count was introduced to decide whether to scale quota and
period or not. However, the scaling is done between forwarding the period
timer and refilling the cfs bandwidth runtime, which means that the period
timer is forwarded with the old "period" while the runtime is refilled with
the scaled "quota".

Move do_sched_cfs_period_timer() before scaling to solve this.
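
To illustrate, here is a minimal userspace sketch of the pre-patch ordering
(the 100ms period and 50ms quota are made-up values and the hrtimer
machinery is omitted): the window the timer was forwarded over is one old
period, yet the refill that follows hands out the already-doubled quota.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t period = 100000000ULL;	/* 100ms, in ns */
		uint64_t quota  =  50000000ULL;	/* 50ms, in ns */

		/* Pre-patch ordering: the timer has already been forwarded by the old period... */
		uint64_t window = period;

		/* ...then the if (++count > 3) path doubles both period and quota... */
		period *= 2;
		quota  *= 2;

		/* ...and only afterwards does do_sched_cfs_period_timer() refill the runtime. */
		uint64_t runtime = quota;

		printf("window of %llu ns refilled with %llu ns (intended %llu ns)\n",
		       (unsigned long long)window, (unsigned long long)runtime,
		       (unsigned long long)(quota / 2));
		return 0;
	}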

Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
Signed-off-by: Huaixin Chang <[email protected]>
---
kernel/sched/fair.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b85b6d..9ace1c5c73a5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5152,6 +5152,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;

+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);

@@ -5181,8 +5183,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
--
2.14.4.44.g2045bb6

2020-04-20 02:47:20

by changhuaixin

Subject: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow

The kernel's limit on cpu.cfs_quota_us is insufficient. Some large
numbers might cause overflow in the to_ratio() calculation and produce
unexpected results.

For example, if we make two cpu cgroups and then write a reasonable
value into the child's cpu.cfs_quota_us and a large value into the
parent's, the write to the parent fails:

cd /sys/fs/cgroup/cpu
mkdir parent; mkdir parent/child
echo 8000 > parent/child/cpu.cfs_quota_us
# 17592186044416 is (1UL << 44)
echo 17592186044416 > parent/cpu.cfs_quota_us

In this case, the quota overflows during the bandwidth shift and the
__cfs_schedulable() check fails. A similar overflow also affects rt
bandwidth.
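
The arithmetic can be checked outside the kernel with a rough sketch
(assuming to_ratio() computes runtime << BW_SHIFT / period on nanosecond
values with BW_SHIFT == 20, and that the group uses the default 100ms
period; both match mainline as far as I can tell):

	#include <stdio.h>
	#include <stdint.h>

	#define BW_SHIFT	20
	#define NSEC_PER_USEC	1000ULL

	static uint64_t to_ratio(uint64_t period, uint64_t runtime)
	{
		return (runtime << BW_SHIFT) / period;	/* the shift can wrap past 64 bits */
	}

	int main(void)
	{
		uint64_t period = 100000ULL * NSEC_PER_USEC;	/* default 100ms period */
		uint64_t parent = (1ULL << 44) * NSEC_PER_USEC;	/* the quota written above */
		uint64_t child  = 8000ULL * NSEC_PER_USEC;

		printf("parent ratio: %llu\n", (unsigned long long)to_ratio(period, parent));
		printf("child ratio:  %llu\n", (unsigned long long)to_ratio(period, child));
		return 0;
	}

The parent's ratio wraps to 0 (2^44 us * 1000 * 2^20 is an exact multiple
of 2^64) and thus looks smaller than the child's 83886, so the parent's
write is rejected.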

Signed-off-by: Huaixin Chang <[email protected]>
---
kernel/sched/core.c | 8 ++++++++
kernel/sched/rt.c | 9 +++++++++
kernel/sched/sched.h | 2 ++
3 files changed, 19 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..f0a74e35c3f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

@@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..f5eea19d68c4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
return ret;
}

+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW_USEC * NSEC_PER_USEC;
+
static int tg_set_rt_bandwidth(struct task_group *tg,
u64 rt_period, u64 rt_runtime)
{
@@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..6f6b7f545557 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
--
2.14.4.44.g2045bb6

2020-04-20 17:51:34

by Benjamin Segall

Subject: Re: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow

Huaixin Chang <[email protected]> writes:

> Kernel limitation on cpu.cfs_quota_us is insufficient. Some large
> numbers might cause overflow in to_ratio() calculation and produce
> unexpected results.
>
> For example, if we make two cpu cgroups and then write a reasonable
> value and a large value into child's and parent's cpu.cfs_quota_us. This
> will cause a write error.
>
> cd /sys/fs/cgroup/cpu
> mkdir parent; mkdir parent/child
> echo 8000 > parent/child/cpu.cfs_quota_us
> # 17592186044416 is (1UL << 44)
> echo 17592186044416 > parent/cpu.cfs_quota_us
>
> In this case, quota will overflow and thus fail the __cfs_schedulable
> check. Similar overflow also affects rt bandwidth.

More to the point is that I think doing

echo 17592186044416 > parent/cpu.cfs_quota_us
echo 8000 > parent/child/cpu.cfs_quota_us

will only fail on the second write, while with this patch it will fail
on the first, which should be more understandable.


to_ratio could be altered to avoid unnecessary internal overflow, but
min_cfs_quota_period is less than 1<<BW_SHIFT, so a cutoff would still
be needed.

Also tg_rt_schedulable sums a bunch of to_ratio(), and doesn't check for
overflow on that sum, so if we consider preventing weirdness around
schedulable checks and max quotas relevant we should probably fix that too.

>
> Signed-off-by: Huaixin Chang <[email protected]>
> ---
> kernel/sched/core.c | 8 ++++++++
> kernel/sched/rt.c | 9 +++++++++
> kernel/sched/sched.h | 2 ++
> 3 files changed, 19 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3a61a3b8eaa9..f0a74e35c3f0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>
> const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
> static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
> +/* More than 203 days if BW_SHIFT equals 20. */
> +static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
>
> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>
> @@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
> if (period > max_cfs_quota_period)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (quota != RUNTIME_INF && quota > max_cfs_runtime)
> + return -EINVAL;
> +
> /*
> * Prevent race between setting of cfs_rq->runtime_enabled and
> * unthrottle_offline_cfs_rqs().
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index df11d88c9895..f5eea19d68c4 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
> return ret;
> }
>
> +/* More than 203 days if BW_SHIFT equals 20. */
> +static const u64 max_rt_runtime = MAX_BW_USEC * NSEC_PER_USEC;

It looks to me like __rt_schedulable doesn't divide by NSEC_PER_USEC, so
to_ratio is operating on nsec, and the limit is in nsec, and MAX_BW_USEC
should probably not be named USEC then as well.

> +
> static int tg_set_rt_bandwidth(struct task_group *tg,
> u64 rt_period, u64 rt_runtime)
> {
> @@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
> if (rt_period == 0)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
> + return -EINVAL;
> +
> mutex_lock(&rt_constraints_mutex);
> err = __rt_schedulable(tg, rt_period, rt_runtime);
> if (err)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index db3a57675ccf..6f6b7f545557 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
> #define BW_SHIFT 20
> #define BW_UNIT (1 << BW_SHIFT)
> #define RATIO_SHIFT 8
> +#define MAX_BW_BITS (64 - BW_SHIFT)
> +#define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
> unsigned long to_ratio(u64 period, u64 runtime);
>
> extern void init_entity_runnable_average(struct sched_entity *se);

2020-04-20 18:56:32

by Benjamin Segall

Subject: Re: [PATCH 2/2] sched/fair: Refill bandwidth before scaling

Huaixin Chang <[email protected]> writes:

> In order to prevent possible hardlockup of sched_cfs_period_timer()
> loop, loop count is introduced to denote whether to scale quota and
> period or not. However, scale is done between forwarding period timer
> and refilling cfs bandwidth runtime, which means that period timer is
> forwarded with old "period" while runtime is refilled with scaled
> "quota".
>
> Move do_sched_cfs_period_timer() before scaling to solve this.

Reviewed-by: Ben Segall <[email protected]>

>
> Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
> Signed-off-by: Huaixin Chang <[email protected]>
> ---
> kernel/sched/fair.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 02f323b85b6d..9ace1c5c73a5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5152,6 +5152,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> if (!overrun)
> break;
>
> + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> +
> if (++count > 3) {
> u64 new, old = ktime_to_ns(cfs_b->period);
>
> @@ -5181,8 +5183,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> /* reset count so we don't come right back in here */
> count = 0;
> }
> -
> - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> }
> if (idle)
> cfs_b->period_active = 0;

2020-04-21 15:14:16

by Phil Auld

Subject: Re: [PATCH 2/2] sched/fair: Refill bandwidth before scaling

Hi,

On Mon, Apr 20, 2020 at 10:44:21AM +0800 Huaixin Chang wrote:
> In order to prevent possible hardlockup of sched_cfs_period_timer()
> loop, loop count is introduced to denote whether to scale quota and
> period or not. However, scale is done between forwarding period timer
> and refilling cfs bandwidth runtime, which means that period timer is
> forwarded with old "period" while runtime is refilled with scaled
> "quota".
>
> Move do_sched_cfs_period_timer() before scaling to solve this.
>
> Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
> Signed-off-by: Huaixin Chang <[email protected]>
> ---
> kernel/sched/fair.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 02f323b85b6d..9ace1c5c73a5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5152,6 +5152,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> if (!overrun)
> break;
>
> + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> +
> if (++count > 3) {
> u64 new, old = ktime_to_ns(cfs_b->period);
>
> @@ -5181,8 +5183,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
> /* reset count so we don't come right back in here */
> count = 0;
> }
> -
> - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
> }
> if (idle)
> cfs_b->period_active = 0;
> --
> 2.14.4.44.g2045bb6
>

This one is independent of the first so could be taken as is.

Reviewed-by: Phil Auld <[email protected]>
--

2020-04-22 03:40:11

by changhuaixin

Subject: Re: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow



> On Apr 21, 2020, at 1:50 AM, [email protected] wrote:
>
> Huaixin Chang <[email protected]> writes:
>
>> Kernel limitation on cpu.cfs_quota_us is insufficient. Some large
>> numbers might cause overflow in to_ratio() calculation and produce
>> unexpected results.
>>
>> For example, if we make two cpu cgroups and then write a reasonable
>> value and a large value into child's and parent's cpu.cfs_quota_us. This
>> will cause a write error.
>>
>> cd /sys/fs/cgroup/cpu
>> mkdir parent; mkdir parent/child
>> echo 8000 > parent/child/cpu.cfs_quota_us
>> # 17592186044416 is (1UL << 44)
>> echo 17592186044416 > parent/cpu.cfs_quota_us
>>
>> In this case, quota will overflow and thus fail the __cfs_schedulable
>> check. Similar overflow also affects rt bandwidth.
>
> More to the point is that I think doing
>
> echo 17592186044416 > parent/cpu.cfs_quota_us
> echo 8000 > parent/child/cpu.cfs_quota_us
>
> will only fail on the second write, while with this patch it will fail
> on the first, which should be more understandable.
>
>
> to_ratio could be altered to avoid unnecessary internal overflow, but
> min_cfs_quota_period is less than 1<<BW_SHIFT, so a cutoff would still
> be needed.
>

Yes, I will rewrite commit log in the following patch.

> Also tg_rt_schedulable sums a bunch of to_ratio(), and doesn't check for
> overflow on that sum, so if we consider preventing weirdness around
> schedulable checks and max quotas relevant we should probably fix that too.
>

It seems to me that a check for overflow on the sum of to_ratio(rt_period, rt_runtime) is not necessary. to_ratio() of an rt group is bounded by global_rt_period() and global_rt_runtime() due to the checks in tg_rt_schedulable(), and global_rt_runtime() is not allowed to be greater than global_rt_period() thanks to sched_rt_global_validate(). Thus, to_ratio() of an rt group will not exceed BW_UNIT, and a sum of such values is unlikely to overflow. A check against rt_runtime overflow during to_ratio() is still needed.
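
As a rough numeric sanity check of that bound (a sketch only; BW_UNIT is
1 << 20 and the one million groups is a deliberately generous assumption):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t bw_unit = 1ULL << 20;	/* per-group ratio capped at BW_UNIT */
		const uint64_t groups  = 1000000ULL;	/* far more groups than realistic */

		printf("worst-case sum: %llu of %llu\n",
		       (unsigned long long)(groups * bw_unit),
		       (unsigned long long)UINT64_MAX);
		return 0;
	}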

Is that correct?

>>
>> Signed-off-by: Huaixin Chang <[email protected]>
>> ---
>> kernel/sched/core.c | 8 ++++++++
>> kernel/sched/rt.c | 9 +++++++++
>> kernel/sched/sched.h | 2 ++
>> 3 files changed, 19 insertions(+)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 3a61a3b8eaa9..f0a74e35c3f0 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>>
>> const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
>> static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
>> +/* More than 203 days if BW_SHIFT equals 20. */
>> +static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
>>
>> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>>
>> @@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>> if (period > max_cfs_quota_period)
>> return -EINVAL;
>>
>> + /*
>> + * Bound quota to defend quota against overflow during bandwidth shift.
>> + */
>> + if (quota != RUNTIME_INF && quota > max_cfs_runtime)
>> + return -EINVAL;
>> +
>> /*
>> * Prevent race between setting of cfs_rq->runtime_enabled and
>> * unthrottle_offline_cfs_rqs().
>> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>> index df11d88c9895..f5eea19d68c4 100644
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
>> return ret;
>> }
>>
>> +/* More than 203 days if BW_SHIFT equals 20. */
>> +static const u64 max_rt_runtime = MAX_BW_USEC * NSEC_PER_USEC;
>
> It looks to me like __rt_schedulable doesn't divide by NSEC_PER_USEC, so
> to_ratio is operating on nsec, and the limit is in nsec, and MAX_BW_USEC
> should probably not be named USEC then as well.

Yes, the limit for rt_runtime is in nsec. This should be changed.

>
>> +
>> static int tg_set_rt_bandwidth(struct task_group *tg,
>> u64 rt_period, u64 rt_runtime)
>> {
>> @@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>> if (rt_period == 0)
>> return -EINVAL;
>>
>> + /*
>> + * Bound quota to defend quota against overflow during bandwidth shift.
>> + */
>> + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
>> + return -EINVAL;
>> +
>> mutex_lock(&rt_constraints_mutex);
>> err = __rt_schedulable(tg, rt_period, rt_runtime);
>> if (err)
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index db3a57675ccf..6f6b7f545557 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
>> #define BW_SHIFT 20
>> #define BW_UNIT (1 << BW_SHIFT)
>> #define RATIO_SHIFT 8
>> +#define MAX_BW_BITS (64 - BW_SHIFT)
>> +#define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
>> unsigned long to_ratio(u64 period, u64 runtime);
>>
>> extern void init_entity_runnable_average(struct sched_entity *se);

2020-04-22 08:40:53

by kernel test robot

Subject: Re: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow

Hi Huaixin,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on tip/auto-latest linus/master linux/master v5.7-rc2 next-20200421]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Huaixin-Chang/Two-small-fixes-for-bandwidth-controller/20200421-230027
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 8f3d9f354286745c751374f5f1fcafee6b3f3136
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day GCC_VERSION=9.3.0 make.cross ARCH=sh

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>

All warnings (new ones prefixed by >>):

In file included from kernel/sched/core.c:9:
>> kernel/sched/sched.h:1922:28: warning: left shift count >= width of type [-Wshift-count-overflow]
1922 | #define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
| ^~
>> kernel/sched/core.c:7394:36: note: in expansion of macro 'MAX_BW_USEC'
7394 | static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
| ^~~~~~~~~~~
--
In file included from kernel/sched/rt.c:6:
>> kernel/sched/sched.h:1922:28: warning: left shift count >= width of type [-Wshift-count-overflow]
1922 | #define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
| ^~
>> kernel/sched/rt.c:2573:35: note: in expansion of macro 'MAX_BW_USEC'
2573 | static const u64 max_rt_runtime = MAX_BW_USEC * NSEC_PER_USEC;
| ^~~~~~~~~~~

vim +1922 kernel/sched/sched.h

1917
1918 #define BW_SHIFT 20
1919 #define BW_UNIT (1 << BW_SHIFT)
1920 #define RATIO_SHIFT 8
1921 #define MAX_BW_BITS (64 - BW_SHIFT)
> 1922 #define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
1923 unsigned long to_ratio(u64 period, u64 runtime);
1924
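
The warning comes from the constant's type rather than the shift amount:
on a 32-bit config unsigned long is 32 bits wide, so 1UL cannot be shifted
left by MAX_BW_BITS (44). A minimal sketch of the difference, for
illustration only (the later revisions of the patch use 1ULL, which avoids
this):

	#include <stdio.h>

	#define MAX_BW_BITS	(64 - 20)
	#define MAX_BW		((1ULL << MAX_BW_BITS) - 1)	/* 1UL here warns where long is 32-bit */

	int main(void)
	{
		printf("sizeof(unsigned long) = %zu, MAX_BW = %llu\n",
		       sizeof(unsigned long), (unsigned long long)MAX_BW);
		return 0;
	}
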

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2020-04-22 18:46:52

by Benjamin Segall

Subject: Re: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow

changhuaixin <[email protected]> writes:

>> On Apr 21, 2020, at 1:50 AM, [email protected] wrote:
>>
>> Huaixin Chang <[email protected]> writes:
>>
>>> Kernel limitation on cpu.cfs_quota_us is insufficient. Some large
>>> numbers might cause overflow in to_ratio() calculation and produce
>>> unexpected results.
>>>
>>> For example, if we make two cpu cgroups and then write a reasonable
>>> value and a large value into child's and parent's cpu.cfs_quota_us. This
>>> will cause a write error.
>>>
>>> cd /sys/fs/cgroup/cpu
>>> mkdir parent; mkdir parent/child
>>> echo 8000 > parent/child/cpu.cfs_quota_us
>>> # 17592186044416 is (1UL << 44)
>>> echo 17592186044416 > parent/cpu.cfs_quota_us
>>>
>>> In this case, quota will overflow and thus fail the __cfs_schedulable
>>> check. Similar overflow also affects rt bandwidth.
>>
>> More to the point is that I think doing
>>
>> echo 17592186044416 > parent/cpu.cfs_quota_us
>> echo 8000 > parent/child/cpu.cfs_quota_us
>>
>> will only fail on the second write, while with this patch it will fail
>> on the first, which should be more understandable.
>>
>>
>> to_ratio could be altered to avoid unnecessary internal overflow, but
>> min_cfs_quota_period is less than 1<<BW_SHIFT, so a cutoff would still
>> be needed.
>>
>
> Yes, I will rewrite commit log in the following patch.
>
>> Also tg_rt_schedulable sums a bunch of to_ratio(), and doesn't check for
>> overflow on that sum, so if we consider preventing weirdness around
>> schedulable checks and max quotas relevant we should probably fix that too.
>>
>
> It seems to me that check for overflow on sum of to_ratio(rt_period, rt_runtime)
> is not necessary. As to_ratio() of a rt group is bounded by global_rt_period()
> and global_rt_runtime() due to the checks in tg_rt_schedulable(). And
> global_rt_runtime() is not allowed to be greater than global_rt_period() thanks
> to sched_rt_global_validate(). Thus, to_ratio() of a rt group will not exceed
> BW_UNIT, sum of which is unlikely to overflow then. Checks against rt_runtime
> overflow during to_ratio is still needed.
>
> Is that correct?

Good point, that's probably not a problem then.

>
>>>
>>> Signed-off-by: Huaixin Chang <[email protected]>
>>> ---
>>> kernel/sched/core.c | 8 ++++++++
>>> kernel/sched/rt.c | 9 +++++++++
>>> kernel/sched/sched.h | 2 ++
>>> 3 files changed, 19 insertions(+)
>>>
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index 3a61a3b8eaa9..f0a74e35c3f0 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>>>
>>> const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
>>> static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
>>> +/* More than 203 days if BW_SHIFT equals 20. */
>>> +static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
>>>
>>> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>>>
>>> @@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>>> if (period > max_cfs_quota_period)
>>> return -EINVAL;
>>>
>>> + /*
>>> + * Bound quota to defend quota against overflow during bandwidth shift.
>>> + */
>>> + if (quota != RUNTIME_INF && quota > max_cfs_runtime)
>>> + return -EINVAL;
>>> +
>>> /*
>>> * Prevent race between setting of cfs_rq->runtime_enabled and
>>> * unthrottle_offline_cfs_rqs().
>>> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>>> index df11d88c9895..f5eea19d68c4 100644
>>> --- a/kernel/sched/rt.c
>>> +++ b/kernel/sched/rt.c
>>> @@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
>>> return ret;
>>> }
>>>
>>> +/* More than 203 days if BW_SHIFT equals 20. */
>>> +static const u64 max_rt_runtime = MAX_BW_USEC * NSEC_PER_USEC;
>>
>> It looks to me like __rt_schedulable doesn't divide by NSEC_PER_USEC, so
>> to_ratio is operating on nsec, and the limit is in nsec, and MAX_BW_USEC
>> should probably not be named USEC then as well.
>
> Yes, the limit for rt_runtime is in nsec. This should be changed.
>
>>
>>> +
>>> static int tg_set_rt_bandwidth(struct task_group *tg,
>>> u64 rt_period, u64 rt_runtime)
>>> {
>>> @@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>>> if (rt_period == 0)
>>> return -EINVAL;
>>>
>>> + /*
>>> + * Bound quota to defend quota against overflow during bandwidth shift.
>>> + */
>>> + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
>>> + return -EINVAL;
>>> +
>>> mutex_lock(&rt_constraints_mutex);
>>> err = __rt_schedulable(tg, rt_period, rt_runtime);
>>> if (err)
>>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>>> index db3a57675ccf..6f6b7f545557 100644
>>> --- a/kernel/sched/sched.h
>>> +++ b/kernel/sched/sched.h
>>> @@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
>>> #define BW_SHIFT 20
>>> #define BW_UNIT (1 << BW_SHIFT)
>>> #define RATIO_SHIFT 8
>>> +#define MAX_BW_BITS (64 - BW_SHIFT)
>>> +#define MAX_BW_USEC ((1UL << MAX_BW_BITS) - 1)
>>> unsigned long to_ratio(u64 period, u64 runtime);
>>>
>>> extern void init_entity_runnable_average(struct sched_entity *se);

2020-04-23 13:41:18

by changhuaixin

Subject: [PATCH] sched: Defend cfs and rt bandwidth quota against overflow

When users write some huge number into cpu.cfs_quota_us or
cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
schedulable checks.

to_ratio() could be altered to avoid unnecessary internal overflow, but
min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
prevent overflow.
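
A quick check of what the cap works out to (a rough sketch assuming
BW_SHIFT == 20, i.e. MAX_BW == (1ULL << 44) - 1; the figures match the
comments in the diff below):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t max_bw = (1ULL << (64 - 20)) - 1;	/* MAX_BW, 2^44 - 1 */

		/* cfs quota is written in usec: ~17,592,186 s, a bit over 203 days */
		printf("max cfs quota:  %.1f days\n", max_bw / 1e6 / 86400.0);
		/* rt runtime is compared in nsec: ~17,592 s, a bit over 4.8 hours */
		printf("max rt runtime: %.1f hours\n", max_bw / 1e9 / 3600.0);
		return 0;
	}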

Signed-off-by: Huaixin Chang <[email protected]>
---
kernel/sched/core.c | 8 ++++++++
kernel/sched/rt.c | 9 +++++++++
kernel/sched/sched.h | 2 ++
3 files changed, 19 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..0be1782e15c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

@@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..7ba49625cdbd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
return ret;
}

+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
+
static int tg_set_rt_bandwidth(struct task_group *tg,
u64 rt_period, u64 rt_runtime)
{
@@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..1f58677a8f23 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
--
2.14.4.44.g2045bb6

2020-04-23 20:37:33

by Benjamin Segall

Subject: Re: [PATCH] sched: Defend cfs and rt bandwidth quota against overflow

Huaixin Chang <[email protected]> writes:

> When users write some huge number into cpu.cfs_quota_us or
> cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
> schedulable checks.
>
> to_ratio() could be altered to avoid unnecessary internal overflow, but
> min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
> be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
> prevent overflow.
>
> Signed-off-by: Huaixin Chang <[email protected]>
> ---
> kernel/sched/core.c | 8 ++++++++
> kernel/sched/rt.c | 9 +++++++++
> kernel/sched/sched.h | 2 ++
> 3 files changed, 19 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3a61a3b8eaa9..0be1782e15c9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>
> const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
> static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
> +/* More than 203 days if BW_SHIFT equals 20. */
> +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>
> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>
> @@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
> if (period > max_cfs_quota_period)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (quota != RUNTIME_INF && quota > max_cfs_runtime)
> + return -EINVAL;
> +
> /*
> * Prevent race between setting of cfs_rq->runtime_enabled and
> * unthrottle_offline_cfs_rqs().
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index df11d88c9895..7ba49625cdbd 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2569,6 +2569,9 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
> return ret;
> }
>
> +/* More than 4 hours if BW_SHIFT equals 20. */
> +static const u64 max_rt_runtime = MAX_BW;
> +
> static int tg_set_rt_bandwidth(struct task_group *tg,
> u64 rt_period, u64 rt_runtime)
> {
> @@ -2585,6 +2588,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
> if (rt_period == 0)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
> + return -EINVAL;
> +

We probably _do_ also want this in sched_rt_global_validate now that I
think of it. Other than missing that, it looks good.

> mutex_lock(&rt_constraints_mutex);
> err = __rt_schedulable(tg, rt_period, rt_runtime);
> if (err)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index db3a57675ccf..1f58677a8f23 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
> #define BW_SHIFT 20
> #define BW_UNIT (1 << BW_SHIFT)
> #define RATIO_SHIFT 8
> +#define MAX_BW_BITS (64 - BW_SHIFT)
> +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
> unsigned long to_ratio(u64 period, u64 runtime);
>
> extern void init_entity_runnable_average(struct sched_entity *se);

2020-04-24 06:38:04

by kernel test robot

Subject: Re: [PATCH 1/2] sched: Defend cfs and rt bandwidth quota against overflow

Hi Huaixin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/auto-latest linus/master v5.7-rc2 next-20200423]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Huaixin-Chang/Two-small-fixes-for-bandwidth-controller/20200421-230027
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 8f3d9f354286745c751374f5f1fcafee6b3f3136
config: i386-randconfig-a001-20200424 (attached as .config)
compiler: gcc-4.9 (Ubuntu 4.9.3-13ubuntu2) 4.9.3
reproduce:
# save the attached .config to linux build tree
make ARCH=i386

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>

All error/warnings (new ones prefixed by >>):

>> kernel/sched/core.c:7394:1: warning: left shift count >= width of type
static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
^
>> kernel/sched/core.c:7394:1: error: initializer element is not computable at load time

vim +7394 kernel/sched/core.c

7390
7391 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7392 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7393 /* More than 203 days if BW_SHIFT equals 20. */
> 7394 static const u64 max_cfs_runtime = MAX_BW_USEC * NSEC_PER_USEC;
7395

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2020-04-25 10:55:11

by changhuaixin

Subject: [PATCH v2] sched: Defend cfs and rt bandwidth quota against overflow

When users write some huge number into cpu.cfs_quota_us or
cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
schedulable checks.

to_ratio() could be altered to avoid unnecessary internal overflow, but
min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
prevent overflow.

Signed-off-by: Huaixin Chang <[email protected]>
---
kernel/sched/core.c | 8 ++++++++
kernel/sched/rt.c | 12 +++++++++++-
kernel/sched/sched.h | 2 ++
3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..0be1782e15c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

@@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88c9895..6d60ba21ed29 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;

if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;

return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..1f58677a8f23 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
--
2.14.4.44.g2045bb6

2020-04-27 18:33:47

by Benjamin Segall

Subject: Re: [PATCH v2] sched: Defend cfs and rt bandwidth quota against overflow

Huaixin Chang <[email protected]> writes:

> When users write some huge number into cpu.cfs_quota_us or
> cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
> schedulable checks.
>
> to_ratio() could be altered to avoid unnecessary internal overflow, but
> min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
> be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
> prevent overflow.

Reviewed-by: Ben Segall <[email protected]>

>
> Signed-off-by: Huaixin Chang <[email protected]>
> ---
> kernel/sched/core.c | 8 ++++++++
> kernel/sched/rt.c | 12 +++++++++++-
> kernel/sched/sched.h | 2 ++
> 3 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3a61a3b8eaa9..0be1782e15c9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7390,6 +7390,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>
> const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
> static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
> +/* More than 203 days if BW_SHIFT equals 20. */
> +static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>
> static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>
> @@ -7417,6 +7419,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
> if (period > max_cfs_quota_period)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (quota != RUNTIME_INF && quota > max_cfs_runtime)
> + return -EINVAL;
> +
> /*
> * Prevent race between setting of cfs_rq->runtime_enabled and
> * unthrottle_offline_cfs_rqs().
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index df11d88c9895..6d60ba21ed29 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -9,6 +9,8 @@
>
> int sched_rr_timeslice = RR_TIMESLICE;
> int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
> +/* More than 4 hours if BW_SHIFT equals 20. */
> +static const u64 max_rt_runtime = MAX_BW;
>
> static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
>
> @@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
> if (rt_period == 0)
> return -EINVAL;
>
> + /*
> + * Bound quota to defend quota against overflow during bandwidth shift.
> + */
> + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
> + return -EINVAL;
> +
> mutex_lock(&rt_constraints_mutex);
> err = __rt_schedulable(tg, rt_period, rt_runtime);
> if (err)
> @@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
> return -EINVAL;
>
> if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
> - (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
> + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
> + ((u64)sysctl_sched_rt_runtime *
> + NSEC_PER_USEC > max_rt_runtime)))
> return -EINVAL;
>
> return 0;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index db3a57675ccf..1f58677a8f23 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1918,6 +1918,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
> #define BW_SHIFT 20
> #define BW_UNIT (1 << BW_SHIFT)
> #define RATIO_SHIFT 8
> +#define MAX_BW_BITS (64 - BW_SHIFT)
> +#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
> unsigned long to_ratio(u64 period, u64 runtime);
>
> extern void init_entity_runnable_average(struct sched_entity *se);

2020-05-01 18:26:28

by tip-bot2 for Tony Luck

Subject: [tip: sched/core] sched/fair: Refill bandwidth before scaling

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 5a6d6a6ccb5f48ca8cf7c6d64ff83fd9c7999390
Gitweb: https://git.kernel.org/tip/5a6d6a6ccb5f48ca8cf7c6d64ff83fd9c7999390
Author: Huaixin Chang <[email protected]>
AuthorDate: Mon, 20 Apr 2020 10:44:21 +08:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 30 Apr 2020 20:14:40 +02:00

sched/fair: Refill bandwidth before scaling

In order to prevent possible hardlockup of sched_cfs_period_timer()
loop, loop count is introduced to denote whether to scale quota and
period or not. However, scale is done between forwarding period timer
and refilling cfs bandwidth runtime, which means that period timer is
forwarded with old "period" while runtime is refilled with scaled
"quota".

Move do_sched_cfs_period_timer() before scaling to solve this.

Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup")
Signed-off-by: Huaixin Chang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Reviewed-by: Phil Auld <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c0216ef..fac5b2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5159,6 +5159,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;

+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);

@@ -5188,8 +5190,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;

2020-05-11 13:06:57

by Peter Zijlstra

Subject: Re: [PATCH v2] sched: Defend cfs and rt bandwidth quota against overflow

On Mon, Apr 27, 2020 at 11:29:30AM -0700, [email protected] wrote:
> Huaixin Chang <[email protected]> writes:
>
> > When users write some huge number into cpu.cfs_quota_us or
> > cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
> > schedulable checks.
> >
> > to_ratio() could be altered to avoid unnecessary internal overflow, but
> > min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
> > be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
> > prevent overflow.
>
> Reviewed-by: Ben Segall <[email protected]>

Thanks!

2020-05-19 18:49:18

by tip-bot2 for Tony Luck

Subject: [tip: sched/core] sched: Defend cfs and rt bandwidth quota against overflow

The following commit has been merged into the sched/core branch of tip:

Commit-ID: d505b8af58912ae1e1a211fabc9995b19bd40828
Gitweb: https://git.kernel.org/tip/d505b8af58912ae1e1a211fabc9995b19bd40828
Author: Huaixin Chang <[email protected]>
AuthorDate: Sat, 25 Apr 2020 18:52:48 +08:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 19 May 2020 20:34:14 +02:00

sched: Defend cfs and rt bandwidth quota against overflow

When users write some huge number into cpu.cfs_quota_us or
cpu.rt_runtime_us, overflow might happen during to_ratio() shifts of
schedulable checks.

to_ratio() could be altered to avoid unnecessary internal overflow, but
min_cfs_quota_period is less than 1 << BW_SHIFT, so a cutoff would still
be needed. Set a cap MAX_BW for cfs_quota_us and rt_runtime_us to
prevent overflow.

Signed-off-by: Huaixin Chang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Ben Segall <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 8 ++++++++
kernel/sched/rt.c | 12 +++++++++++-
kernel/sched/sched.h | 2 ++
3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 74fb89b..fa905b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7379,6 +7379,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

@@ -7407,6 +7409,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;

/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index df11d88..6d60ba2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,8 @@

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

@@ -2585,6 +2587,12 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (rt_period == 0)
return -EINVAL;

+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
err = __rt_schedulable(tg, rt_period, rt_runtime);
if (err)
@@ -2702,7 +2710,9 @@ static int sched_rt_global_validate(void)
return -EINVAL;

if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
return -EINVAL;

return 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2bd2a22..f7ab633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1915,6 +1915,8 @@ extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);