From: Aaron Lu <aaron.lu@intel.com>
To: linux-kernel@vger.kernel.org
Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Tim Chen, Dave Hansen, Huang Ying, Aaron Lu
Subject: [PATCH] smp: do not send IPI if call_single_queue not empty
Date: Fri, 28 Apr 2017 21:19:44 +0800
Message-Id: <20170428131944.11928-1-aaron.lu@intel.com>

In smp_call_function_many(), whenever a CPU queues a CSD to a target
CPU, it sends an IPI to let the target CPU handle the work. This isn't
necessary: we only need to send an IPI when queueing a CSD to an empty
call_single_queue.

The reason is that flush_smp_call_function_queue(), which runs when a
CPU receives such an IPI, empties the queue and then handles all of the
CSDs on it. So if the target CPU's call_single_queue is not empty, we
know that:

i.  an IPI for the target CPU has already been sent by a previous
    queuer*;
ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet.

Thus, it's safe for us to just queue our CSD there without sending an
additional IPI.

*More precisely, only the first queuer, the one that found the queue
empty, needs to send the IPI.

The workload used to measure the effect of this change is
vm-scalability's case-swap-w-seq-mt:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-anon-w-seq-mt

It spawns 88 threads that continuously consume anonymous memory. The
test machine is a 2-node Broadwell-EP with 88 threads and 32G of
memory. The test size is 100G, so a lot of swap-out occurs once the
available memory is used up.

Because:
i.  shrink_page_list() needs try_to_unmap_flush() -> flush_tlb_others();
ii. this process' mm_cpumask() is almost full;
the number of IPIs sent during the workload is huge.

Base:
  "interrupts.CAL:Function_call_interrupts": 819388170.0,
  "vm-scalability.throughput": 5051472.0,

This patch:
  "interrupts.CAL:Function_call_interrupts": 92214434.66666667,  ↓88.7%
  "vm-scalability.throughput": 5991764.333333333                 ↑18.6%

Function call interrupts dropped by 88.7% and throughput increased by
18.6%.
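To illustrate the idea (this is only a minimal userspace sketch, not the
kernel code; queue_push() and send_ipi() are made-up names), the key
point is that an llist_add()-style push reports whether the list was
empty before, so only the first queuer needs to notify the target CPU:

/*
 * Minimal sketch, assuming C11 atomics; it models the empty->non-empty
 * check, not the kernel's llist implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct csd {
	struct csd *next;
	void (*func)(void *info);
	void *info;
};

/* One lock-free push list per target CPU, like call_single_queue. */
struct queue {
	_Atomic(struct csd *) head;
};

/*
 * Push a CSD and return true only if the list was empty before, i.e.
 * the caller is the first queuer; this mirrors llist_add()'s return
 * value.
 */
static bool queue_push(struct queue *q, struct csd *csd)
{
	struct csd *old = atomic_load(&q->head);

	do {
		csd->next = old;
	} while (!atomic_compare_exchange_weak(&q->head, &old, csd));

	return old == NULL;
}

/* Stand-in for sending an IPI to one CPU. */
static void send_ipi(int cpu)
{
	printf("IPI -> CPU %d\n", cpu);
}

int main(void)
{
	struct queue q = { NULL };
	struct csd a = { 0 }, b = { 0 };

	if (queue_push(&q, &a))		/* empty -> non-empty: send IPI */
		send_ipi(1);
	if (queue_push(&q, &b))		/* already non-empty: no IPI    */
		send_ipi(1);

	return 0;
}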
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 kernel/smp.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..76d16fe3c427 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
 struct call_function_data {
 	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
+	cpumask_var_t		cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
 	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				     cpu_to_node(cpu)))
 		return -ENOMEM;
+	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+				     cpu_to_node(cpu))) {
+		free_cpumask_var(cfd->cpumask);
+		return -ENOMEM;
+	}
 	cfd->csd = alloc_percpu(struct call_single_data);
 	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
+		free_cpumask_var(cfd->cpumask_ipi);
 		return -ENOMEM;
 	}
 
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
 	free_cpumask_var(cfd->cpumask);
+	free_cpumask_var(cfd->cpumask_ipi);
 	free_percpu(cfd->csd);
 	return 0;
 }
@@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	if (unlikely(!cpumask_weight(cfd->cpumask)))
 		return;
 
+	cpumask_clear(cfd->cpumask_ipi);
 	for_each_cpu(cpu, cfd->cpumask) {
 		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
 
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
 			csd->flags |= CSD_FLAG_SYNCHRONOUS;
 		csd->func = func;
 		csd->info = info;
-		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+			cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(cfd->cpumask);
+	arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
 	if (wait) {
 		for_each_cpu(cpu, cfd->cpumask) {
-- 
2.9.3
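A note on the design, and a hypothetical drain step for the sketch
above (reusing struct queue and struct csd from it): one IPI is enough
because the receiving side detaches the whole pending list at once and
runs every queued CSD, which is what flush_smp_call_function_queue()
does with the real call_single_queue. A separate cfd->cpumask_ipi is
used because the wait path still iterates the full cfd->cpumask to wait
on every queued CSD, not just the ones that triggered an IPI.

/*
 * Sketch of the drain side, modelling the behaviour of
 * flush_smp_call_function_queue(): detach the whole list with one
 * atomic exchange, then run every CSD. Entries come back newest-first
 * here; the order does not matter for this illustration.
 */
static void flush_queue(struct queue *q)
{
	struct csd *entry = atomic_exchange(&q->head, (struct csd *)NULL);

	while (entry) {
		struct csd *next = entry->next;

		if (entry->func)
			entry->func(entry->info);
		entry = next;
	}
}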