From: Frederic Weisbecker
To: Ingo Molnar
Cc: LKML, Frederic Weisbecker, Peter Zijlstra, Chris Metcalf,
    Thomas Gleixner, Luiz Capitulino, Christoph Lameter,
    "Paul E. McKenney", Wanpeng Li, Mike Galbraith, Rik van Riel
Subject: [PATCH 4/5] sched/isolation: Residual 1Hz scheduler tick offload
Date: Thu, 4 Jan 2018 05:25:36 +0100
Message-Id: <1515039937-367-5-git-send-email-frederic@kernel.org>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1515039937-367-1-git-send-email-frederic@kernel.org>
References: <1515039937-367-1-git-send-email-frederic@kernel.org>

When a CPU runs in full dynticks mode, a 1Hz tick remains in order to
keep the scheduler stats alive. However, this residual tick is a burden
for bare-metal tasks that can't stand any interruption at all, or want
to minimize them.

Adding the boot parameter "isolcpus=nohz_offload" will now outsource
these scheduler ticks to the global workqueue so that a housekeeping
CPU handles the tick remotely.

Note it's still up to the user to affine the global workqueues to the
housekeeping CPUs through /sys/devices/virtual/workqueue/cpumask or
domain isolation.

Signed-off-by: Frederic Weisbecker
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Luiz Capitulino
Cc: Mike Galbraith
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Wanpeng Li
Cc: Ingo Molnar
---
 kernel/sched/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/isolation.c |  4 +++
 kernel/sched/sched.h     |  2 ++
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d72d0e9..b964890 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3052,9 +3052,14 @@ void scheduler_tick(void)
  */
 u64 scheduler_tick_max_deferment(void)
 {
-        struct rq *rq = this_rq();
-        unsigned long next, now = READ_ONCE(jiffies);
+        struct rq *rq;
+        unsigned long next, now;

+        if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_TICK_SCHED))
+                return ktime_to_ns(KTIME_MAX);
+
+        rq = this_rq();
+        now = READ_ONCE(jiffies);
         next = rq->last_sched_tick + HZ;

         if (time_before_eq(next, now))
@@ -3062,7 +3067,82 @@ u64 scheduler_tick_max_deferment(void)

         return jiffies_to_nsecs(next - now);
 }
-#endif
+
+struct tick_work {
+        int cpu;
+        struct delayed_work work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
+{
+        struct delayed_work *dwork = to_delayed_work(work);
+        struct tick_work *twork = container_of(dwork, struct tick_work, work);
+        int cpu = twork->cpu;
+        struct rq *rq = cpu_rq(cpu);
+        struct rq_flags rf;
+
+        /*
+         * Handle the tick only if it appears the remote CPU is running
+         * in full dynticks mode. The check is racy by nature, but
+         * missing a tick or having one too many is no big deal.
+         */
+        if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+                rq_lock_irq(rq, &rf);
+                update_rq_clock(rq);
+                rq->curr->sched_class->task_tick(rq, rq->curr, 0);
+                rq_unlock_irq(rq, &rf);
+        }
+
+        queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+        struct tick_work *twork;
+
+        if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+                return;
+
+        WARN_ON_ONCE(!tick_work_cpu);
+
+        twork = per_cpu_ptr(tick_work_cpu, cpu);
+        twork->cpu = cpu;
+        INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+        queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+        struct tick_work *twork;
+
+        if (housekeeping_cpu(cpu, HK_FLAG_TICK_SCHED))
+                return;
+
+        WARN_ON_ONCE(!tick_work_cpu);
+
+        twork = per_cpu_ptr(tick_work_cpu, cpu);
+        cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+        tick_work_cpu = alloc_percpu(struct tick_work);
+        if (!tick_work_cpu) {
+                pr_err("Can't allocate remote tick struct\n");
+                return -ENOMEM;
+        }
+
+        return 0;
+}
+
+#else
+static void sched_tick_start(int cpu) { }
+static void sched_tick_stop(int cpu) { }
+#endif /* CONFIG_NO_HZ_FULL */

 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
@@ -5713,6 +5793,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
         set_cpu_rq_start_time(cpu);
         sched_rq_cpu_starting(cpu);
+        sched_tick_start(cpu);
         return 0;
 }

@@ -5724,6 +5805,7 @@ int sched_cpu_dying(unsigned int cpu)

         /* Handle pending wakeups and then migrate everything off */
         sched_ttwu_pending();
+        sched_tick_stop(cpu);

         rq_lock_irqsave(rq, &rf);
         if (rq->rd) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 264ddcd..c5e7e90a 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/ctype.h>
+#include "sched.h"

 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +61,9 @@ void __init housekeeping_init(void)

         static_branch_enable(&housekeeping_overriden);

+        if (housekeeping_flags & HK_FLAG_TICK_SCHED)
+                sched_tick_offload_init();
+
         /* We need at least one CPU to handle housekeeping work */
         WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5a3b82c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1587,6 +1587,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);

 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);

 /*
  * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1611,6 +1612,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
         tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
 }
 #else
+static inline int sched_tick_offload_init(void) { return 0; }
 static inline void sched_update_tick_dependency(struct rq *rq) { }
 #endif
-- 
2.7.4
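
For illustration only (not part of the patch): a minimal usage sketch
of what the commit message describes, on a hypothetical 8-CPU machine
with CPU 0 kept as the housekeeping CPU. The flag,cpulist form of
"isolcpus" comes from earlier patches in this series, so the exact
cpulist below is an assumed example rather than something this patch
defines:

    # Boot with the residual 1Hz tick of CPUs 1-7 offloaded to the
    # global workqueue (assumed flag,cpulist syntax):
    #     isolcpus=nohz_offload,1-7

    # After boot, affine the unbound workqueues, which carry the
    # offloaded remote ticks, to housekeeping CPU 0
    # (hex cpumask with only bit 0 set):
    echo 1 > /sys/devices/virtual/workqueue/cpumask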