DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com 8C921C047B6F
Date: Tue, 29 Aug 2017 17:56:46 +0300
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Yang Zhang <yang.zhang.wz@gmail.com>
Cc: linux-kernel@vger.kernel.org, kvm@vger.kernel.org,
        wanpeng.li@hotmail.com, pbonzini@redhat.com, tglx@linutronix.de,
        rkrcmar@redhat.com, dmatlack@google.com, agraf@suse.de,
        peterz@infradead.org, linux-doc@vger.kernel.org
Subject: Re: [RFC PATCH v2 0/7] x86/idle: add halt poll support
Message-ID: <20170829174147-mutt-send-email-mst@kernel.org>
References: <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <1504007201-12904-1-git-send-email-yang.zhang.wz@gmail.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 8140
Lines: 256

On Tue, Aug 29, 2017 at 11:46:34AM +0000, Yang Zhang wrote:
> Some latency-intensive workloads will see an obvious performance
> drop when running inside a VM.

But are we trading a lot of CPU for a bit of lower latency?

> The main reason is that the overhead 
> is amplified when running inside VM. The most cost i have seen is 
> inside idle path. 
> 
> This patch introduces a new mechanism to poll for a while before
> entering the idle state. If scheduling is needed during the poll, then
> we don't need to go through the heavy overhead path.

Isn't it the job of an idle driver to find the best way to
halt the CPU?

It looks like just by adding a cstate we can make it
halt at higher latencies only. And at lower latencies,
if it's doing a good job we can hopefully use mwait to
stop the CPU.

In fact, I have been experimenting with exactly that.
Some initial results are encouraging, but I could use help
with testing and especially tuning. If you can help,
please let me know!

The patch below is not intended for upstream - it's just
the fastest way I found to test things.
So it just uses command line arguments to configure the guest;
the right way is through a combination of ACPI and CPUID,
but let's decide whether we need this first.

RFC dontmerge PATCH intel_idle: add pv cstates when running on kvm

Usage:

kvm_pv_mwait - enables the feature. Note: you must have a recent
		host that allows guests to execute mwait without an exit,
		otherwise you will just get 100% CPU.

kvm_halt_target_residency - halt above this target residency.
		Should probably be a function of the cost of
		halt+wakeup.

kvm_halt_native - set to 0 if your VCPU does not match the host.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c2ae819..6fa58ad 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -65,8 +65,10 @@
 #include <asm/intel-family.h>
 #include <asm/mwait.h>
 #include <asm/msr.h>
+#include <linux/kvm_para.h>
 
 #define INTEL_IDLE_VERSION "0.4.1"
+#define PREFIX "intel_idle: "
 
 static struct cpuidle_driver intel_idle_driver = {
 	.name = "intel_idle",
@@ -94,6 +96,7 @@ struct idle_cpu {
 };
 
 static const struct idle_cpu *icpu;
+static struct idle_cpu icpus;
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 static int intel_idle(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv, int index);
@@ -119,6 +122,49 @@ static struct cpuidle_state *cpuidle_state_table;
 #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
 #define MWAIT2flg(eax) ((eax & 0xFF) << 24)
 
+static int intel_halt(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv, int index)
+{
+	printk_once(KERN_ERR "safe_halt started\n");
+	safe_halt();
+	printk_once(KERN_ERR "safe_halt done\n");
+	return index;
+}
+
+static int kvm_halt_target_residency = 400; /* Halt above this target residency */
+module_param(kvm_halt_target_residency, int, 0444);
+static int kvm_halt_native = 1; /* Use native mwait substates */
+module_param(kvm_halt_native, int, 0444);
+static int kvm_pv_mwait = 0; /* Whether to do mwait within KVM */
+module_param(kvm_pv_mwait, int, 0444);
+
+static struct cpuidle_state kvm_halt_cstate = {
+	.name = "HALT-KVM",
+	.desc = "HALT",
+	.flags = MWAIT2flg(0x10),
+	.exit_latency = 0,
+	.target_residency = 0,
+	.enter = &intel_halt,
+};
+
+static struct cpuidle_state kvm_cstates[] = {
+	{
+		.name = "C1-NHM",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 3,
+		.target_residency = 6,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.name = "HALT-KVM",
+		.desc = "HALT",
+		.flags = MWAIT2flg(0x10),
+		.exit_latency = 30,
+		.target_residency = 399,
+		.enter = &intel_halt, }
+};
+
 /*
  * States are indexed by the cstate number,
  * which is also the index into the MWAIT hint array.
@@ -927,8 +973,11 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		tick_broadcast_enter();
 
+	printk_once(KERN_ERR "mwait_idle_with_hints started\n");
 	mwait_idle_with_hints(eax, ecx);
 
+	printk_once(KERN_ERR "mwait_idle_with_hints done\n");
+
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		tick_broadcast_exit();
 
@@ -989,6 +1038,10 @@ static const struct idle_cpu idle_cpu_tangier = {
 	.state_table = tangier_cstates,
 };
 
+static const struct idle_cpu idle_cpu_kvm = {
+	.state_table = kvm_cstates,
+};
+
 static const struct idle_cpu idle_cpu_lincroft = {
 	.state_table = atom_cstates,
 	.auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
@@ -1061,7 +1115,7 @@ static const struct idle_cpu idle_cpu_dnv = {
 };
 
 #define ICPU(model, cpu) \
-	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
+	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&cpu }
 
 static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	ICPU(INTEL_FAM6_NEHALEM_EP,		idle_cpu_nehalem),
@@ -1125,19 +1180,39 @@ static int __init intel_idle_probe(void)
 		return -ENODEV;
 	}
 
-	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
-		return -ENODEV;
+	icpus = *(struct idle_cpu *)id->driver_data;
+
+	if (kvm_pv_mwait) {
+
+		if (!kvm_halt_native)
+			icpus = idle_cpu_kvm;
+
+		pr_debug(PREFIX "MWAIT enabled by KVM\n");
+		mwait_substates = 0x1;
+		/*
+		 * these MSRs do not work on kvm maybe they should?
+		 * more likely we need to poke at CPUID before using MSRs
+		 */
+		icpus.auto_demotion_disable_flags = 0;
+		icpus.disable_promotion_to_c1e = 0;
+	} else {
+		if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
+			return -ENODEV;
+
+		if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+			return -ENODEV;
 
-	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+		cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
 
-	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
-	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
-	    !mwait_substates)
+		if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+		    !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
+		    !mwait_substates)
 			return -ENODEV;
 
-	pr_debug("MWAIT substates: 0x%x\n", mwait_substates);
+		pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates);
+	}
 
-	icpu = (const struct idle_cpu *)id->driver_data;
+	icpu = &icpus;
 	cpuidle_state_table = icpu->state_table;
 
 	pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
@@ -1340,6 +1415,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
 		    (cpuidle_state_table[cstate].enter_freeze == NULL))
 			break;
 
+		if (kvm_pv_mwait &&
+		    cpuidle_state_table[cstate].target_residency >=
+		    kvm_halt_target_residency)
+			break;
+
 		if (cstate + 1 > max_cstate) {
 			pr_info("max_cstate %d reached\n", max_cstate);
 			break;
@@ -1353,7 +1433,7 @@ static void __init intel_idle_cpuidle_driver_init(void)
 					& MWAIT_SUBSTATE_MASK;
 
 		/* if NO sub-states for this state in CPUID, skip it */
-		if (num_substates == 0)
+		if (num_substates == 0 && !kvm_pv_mwait)
 			continue;
 
 		/* if state marked as disabled, skip it */
@@ -1375,6 +1455,20 @@ static void __init intel_idle_cpuidle_driver_init(void)
 		drv->state_count += 1;
 	}
 
+	if (kvm_halt_native && kvm_pv_mwait) {
+		drv->states[drv->state_count] =	/* structure copy */
+			kvm_halt_cstate;
+		drv->states[drv->state_count].exit_latency =
+			drv->state_count > 1 ?
+			drv->states[drv->state_count - 1].exit_latency + 1 : 1;
+		drv->states[drv->state_count].target_residency =
+			kvm_halt_target_residency;
+
+		drv->state_count += 1;
+	}
+
+	printk(KERN_ERR "detected states: %d\n\n",  drv->state_count);
+
 	if (icpu->byt_auto_demotion_disable_flag) {
 		wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0);
 		wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0);
@@ -1452,7 +1546,8 @@ static int __init intel_idle_init(void)
 		goto init_driver_fail;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
+	if (boot_cpu_has(X86_FEATURE_ARAT) ||	/* Always Reliable APIC Timer */
+	    kvm_pv_mwait)
 		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
 
 	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",