Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
Sender: "joshdon via sendgmr" <joshdon@joshdon.svl.corp.google.com>
Date:   Thu, 22 Oct 2020 20:29:42 -0700
Message-Id: <20201023032944.399861-1-joshdon@google.com>
Mime-Version: 1.0
Subject: [PATCH 1/3] sched: better handling for busy polling loops
From:   Josh Don <joshdon@google.com>
To:     Ingo Molnar <mingo@redhat.com>,
        Peter Zijlstra <peterz@infradead.org>,
        Juri Lelli <juri.lelli@redhat.com>,
        Vincent Guittot <vincent.guittot@linaro.org>,
        "David S. Miller" <davem@davemloft.net>,
        Jakub Kicinski <kuba@kernel.org>
Cc:     Dietmar Eggemann <dietmar.eggemann@arm.com>,
        Steven Rostedt <rostedt@goodmis.org>,
        Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
        Paolo Bonzini <pbonzini@redhat.com>,
        Eric Dumazet <edumazet@google.com>,
        linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
        kvm@vger.kernel.org, Josh Don <joshdon@google.com>,
        Xi Wang <xii@google.com>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk

Busy polling loops in the kernel such as network socket poll and kvm
halt polling have performance problems related to process scheduler load
accounting.

Both of the busy polling examples are opportunistic - they relinquish
the cpu if another thread is ready to run. This design, however, doesn't
extend to multiprocessor load balancing very well. The scheduler still
sees the busy polling cpu as 100% busy and will be less likely to put
another thread on that cpu. In other words, if all cores are 100%
utilized and some of them are running real workloads and some others are
running busy polling loops, newly woken up threads will not prefer the
busy polling cpus. System wide throughput and latency may suffer.

This change allows the scheduler to detect busy polling cpus in order to
allow them to be more frequently considered for wake up balancing.

This change also disables preemption for the duration of the busy
polling loop. This is important, as it ensures that if a polling thread
decides to end its poll to relinquish cpu to another thread, the polling
thread will actually exit the busy loop and potentially block. When it
later becomes runnable, it will have the opportunity to find an idle cpu
via wakeup cpu selection.

Suggested-by: Xi Wang <xii@google.com>
Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Xi Wang <xii@google.com>
---
 include/linux/sched.h |  5 +++
 kernel/sched/core.c   | 94 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c   | 25 ++++++++----
 kernel/sched/sched.h  |  2 +
 4 files changed, 119 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935..80ef477e5a87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1651,6 +1651,7 @@ extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
 extern int available_idle_cpu(int cpu);
+extern int polling_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern void sched_set_fifo(struct task_struct *p);
@@ -2048,4 +2049,8 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+extern void prepare_to_busy_poll(void);
+extern int continue_busy_poll(void);
+extern void end_busy_poll(bool allow_resched);
+
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d95dc3f4644..2783191d0bd4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5107,6 +5107,24 @@ int available_idle_cpu(int cpu)
 	return 1;
 }
 
+/**
+ * polling_cpu - is a given CPU currently running a thread in a busy polling
+ * loop that could be preempted if a new thread were to be scheduled?
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently polling. 0 otherwise.
+ */
+int polling_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+	struct rq *rq = cpu_rq(cpu);
+
+	return unlikely(rq->busy_polling);
+#else
+	return 0;
+#endif
+}
+
 /**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
@@ -7191,6 +7209,7 @@ void __init sched_init(void)
 
 		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
 #endif
+		rq->busy_polling = 0;
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7417,6 +7436,81 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
 
 #endif
 
+/*
+ * Calling this function before entering a preemptible busy polling loop will
+ * help the scheduler make better load balancing decisions. Wake up balance
+ * will treat the polling cpu as idle.
+ *
+ * Preemption is disabled inside this function and re-enabled in
+ * end_busy_poll(), thus the polling loop must periodically check
+ * continue_busy_poll().
+ *
+ * REQUIRES: prepare_to_busy_poll(), continue_busy_poll(), and end_busy_poll()
+ * must be used together.
+ */
+void prepare_to_busy_poll(void)
+{
+	struct rq __maybe_unused *rq = this_rq();
+	unsigned long __maybe_unused flags;
+
+	/* Preemption will be reenabled by end_busy_poll() */
+	preempt_disable();
+
+#ifdef CONFIG_SMP
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	/* preemption disabled; only one thread can poll at a time */
+	WARN_ON_ONCE(rq->busy_polling);
+	rq->busy_polling++;
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+#endif
+}
+EXPORT_SYMBOL(prepare_to_busy_poll);
+
+int continue_busy_poll(void)
+{
+	if (!single_task_running())
+		return 0;
+
+	/* Important that we check this, since preemption is disabled */
+	if (need_resched())
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(continue_busy_poll);
+
+/*
+ * Restore any state modified by prepare_to_busy_poll(), including re-enabling
+ * preemption.
+ *
+ * @allow_resched: If true, this potentially calls schedule() as part of
+ * enabling preemption. A busy poll loop can use false in order to have an
+ * opportunity to block before rescheduling.
+ */
+void end_busy_poll(bool allow_resched)
+{
+#ifdef CONFIG_SMP
+	struct rq *rq = this_rq();
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	BUG_ON(!rq->busy_polling); /* not paired with prepare() */
+	rq->busy_polling--;
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+#endif
+
+	/*
+	 * preemption needs to be kept disabled between prepare_to_busy_poll()
+	 * and end_busy_poll().
+	 */
+	BUG_ON(preemptible());
+	if (allow_resched)
+		preempt_enable();
+	else
+		preempt_enable_no_resched();
+}
+EXPORT_SYMBOL(end_busy_poll);
+
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536add..58e525c74cc6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5460,6 +5460,11 @@ static int sched_idle_cpu(int cpu)
 {
 	return sched_idle_rq(cpu_rq(cpu));
 }
+
+static int sched_idle_or_polling_cpu(int cpu)
+{
+	return sched_idle_cpu(cpu) || polling_cpu(cpu);
+}
 #endif
 
 /*
@@ -5880,6 +5885,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 	u64 latest_idle_timestamp = 0;
 	int least_loaded_cpu = this_cpu;
 	int shallowest_idle_cpu = -1;
+	int found_polling = 0;
 	int i;
 
 	/* Check if we have any choice: */
@@ -5914,10 +5920,14 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
+			int polling = polling_cpu(i);
+
 			load = cpu_load(cpu_rq(i));
-			if (load < min_load) {
+			if ((polling == found_polling && load < min_load) ||
+			    (polling && !found_polling)) {
 				min_load = load;
 				least_loaded_cpu = i;
+				found_polling = polling;
 			}
 		}
 	}
@@ -6085,7 +6095,7 @@ static int select_idle_smt(struct task_struct *p, int target)
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
 			return cpu;
 	}
 
@@ -6149,7 +6159,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu_wrap(cpu, cpus, target) {
 		if (!--nr)
 			return -1;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (available_idle_cpu(cpu) || sched_idle_or_polling_cpu(cpu))
 			break;
 	}
 
@@ -6179,7 +6189,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
-		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+		if (!available_idle_cpu(cpu) && !sched_idle_or_polling_cpu(cpu))
 			continue;
 		if (task_fits_capacity(p, cpu_cap))
 			return cpu;
@@ -6223,14 +6233,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	}
 
 symmetric:
-	if (available_idle_cpu(target) || sched_idle_cpu(target))
+	if (available_idle_cpu(target) || sched_idle_or_polling_cpu(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+	    (available_idle_cpu(prev) || sched_idle_or_polling_cpu(prev)))
 		return prev;
 
 	/*
@@ -6252,7 +6262,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+	    (available_idle_cpu(recent_used_cpu) ||
+	     sched_idle_or_polling_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..45de468d0ffb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1003,6 +1003,8 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
+
+	unsigned int		busy_polling;
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-- 
2.29.0.rc1.297.gfa9743e501-goog