2012-08-07 07:29:06

by Zhenzhong Duan

[permalink] [raw]
Subject: [PATCH] Parallelize mtrr init between cpus

Current code serialize mtrr init with set_atomicity_lock.
MTRR init is quite slow when we boot up an HVM guest with large memory,
many vcpus, and PCI passed-through devices (e.g. 24 vcpus + 90G mem).
It took ~30 mins to boot up; after the patch, it took ~2 min.

Signed-off-by: Zhenzhong Duan <[email protected]>
---
arch/x86/kernel/cpu/mtrr/generic.c | 57 +++++++++++++++++-------------------
1 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e9fe907..a1468b7 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -335,8 +335,9 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}

-static void prepare_set(void);
-static void post_set(void);
+static void prepare_set(unsigned long *cr4_p, u32 *deftype_lo_p,
+ u32 *deftype_hi_p);
+static void post_set(unsigned long cr4, u32 deftype_lo, u32 deftype_hi);

static void __init print_mtrr_state(void)
{
@@ -385,7 +386,8 @@ static void __init print_mtrr_state(void)
void __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
- unsigned long flags;
+ unsigned long flags, cr4;
+ u32 deftype_lo, deftype_hi;
unsigned lo, dummy;
unsigned int i;

@@ -420,11 +422,11 @@ void __init get_mtrr_state(void)

/* PAT setup for BP. We need to go through sync steps here */
local_irq_save(flags);
- prepare_set();
+ prepare_set(&cr4, &deftype_lo, &deftype_hi);

pat_init();

- post_set();
+ post_set(cr4, deftype_lo, deftype_hi);
local_irq_restore(flags);
}

@@ -610,15 +612,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
return changed;
}

-static u32 deftype_lo, deftype_hi;
-
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
* NOTE: The CPU must already be in a safe state for MTRR changes.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
-static unsigned long set_mtrr_state(void)
+static unsigned long set_mtrr_state(u32 *deftype_lo_p, u32 *deftype_hi_p)
{
unsigned long change_mask = 0;
unsigned int i;
@@ -635,10 +635,10 @@ static unsigned long set_mtrr_state(void)
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
- if ((deftype_lo & 0xff) != mtrr_state.def_type
- || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+ if ((*deftype_lo_p & 0xff) != mtrr_state.def_type
+ || ((*deftype_lo_p & 0xc00) >> 10) != mtrr_state.enabled) {

- deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
+ *deftype_lo_p = (*deftype_lo_p & ~0xcff) | mtrr_state.def_type |
(mtrr_state.enabled << 10);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
@@ -647,9 +647,6 @@ static unsigned long set_mtrr_state(void)
}


-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
/*
* Since we are disabling the cache don't allow any interrupts,
* they would run extremely slow and would only increase the pain.
@@ -657,7 +654,8 @@ static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
* The caller must ensure that local interrupts are disabled and
* are reenabled after post_set() has been called.
*/
-static void prepare_set(void) __acquires(set_atomicity_lock)
+static void prepare_set(unsigned long *cr4_p, u32 *deftype_lo_p,
+ u32 *deftype_hi_p)
{
unsigned long cr0;

@@ -668,8 +666,6 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
* changes to the way the kernel boots
*/

- raw_spin_lock(&set_atomicity_lock);
-
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
@@ -677,22 +673,22 @@ static void prepare_set(void) __acquires(set_atomicity_lock)

/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ *cr4_p = read_cr4();
+ write_cr4(*cr4_p & ~X86_CR4_PGE);
}

/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
__flush_tlb();

/* Save MTRR state */
- rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
+ rdmsr(MSR_MTRRdefType, *deftype_lo_p, *deftype_hi_p);

/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, *deftype_lo_p & ~0xcff, *deftype_hi_p);
wbinvd();
}

-static void post_set(void) __releases(set_atomicity_lock)
+static void post_set(unsigned long cr4, u32 deftype_lo, u32 deftype_hi)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
__flush_tlb();
@@ -706,24 +702,24 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}

static void generic_set_all(void)
{
unsigned long mask, count;
- unsigned long flags;
+ unsigned long flags, cr4;
+ u32 deftype_lo, deftype_hi;

local_irq_save(flags);
- prepare_set();
+ prepare_set(&cr4, &deftype_lo, &deftype_hi);

/* Actually set the state */
- mask = set_mtrr_state();
+ mask = set_mtrr_state(&deftype_lo, &deftype_hi);

/* also set PAT */
pat_init();

- post_set();
+ post_set(cr4, deftype_lo, deftype_hi);
local_irq_restore(flags);

/* Use the atomic bitops to update the global mask */
@@ -748,13 +744,14 @@ static void generic_set_all(void)
static void generic_set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type)
{
- unsigned long flags;
+ unsigned long flags, cr4;
+ u32 deftype_lo, deftype_hi;
struct mtrr_var_range *vr;

vr = &mtrr_state.var_ranges[reg];

local_irq_save(flags);
- prepare_set();
+ prepare_set(&cr4, &deftype_lo, &deftype_hi);

if (size == 0) {
/*
@@ -773,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}

- post_set();
+ post_set(cr4, deftype_lo, deftype_hi);
local_irq_restore(flags);
}

--
1.7.3


2012-08-07 09:01:36

by Zhenzhong Duan

[permalink] [raw]
Subject: Re: [PATCH] Parallelize mtrr init between cpus

I also post a question to xen-devel maillist.
Although this patch fixes the long boot-time issue in HVM, I don't know
why an HVM guest would waste such a long time at bootup.
Also I'm not sure if this patch is correct in all cases.
Maybe I miss something.
link for reference: http://www.gossamer-threads.com/lists/xen/devel/252757

thanks

2012-08-07 15:29, zhenzhong.duan wrote:
> Current code serialize mtrr init with set_atomicity_lock.
> Mtrr init is quite slow when we bootup on a hvm with large mem, vcpus
> and pci passthroughed devices(eg. 24 vcpus + 90G mem).
> It took about ~30 mins to bootup, after patch, it took ~2 min.
>
> Signed-off-by: Zhenzhong Duan <[email protected]>
> ---
> arch/x86/kernel/cpu/mtrr/generic.c | 57 +++++++++++++++++-------------------
> 1 files changed, 27 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
> index e9fe907..a1468b7 100644
> --- a/arch/x86/kernel/cpu/mtrr/generic.c
> +++ b/arch/x86/kernel/cpu/mtrr/generic.c
> @@ -335,8 +335,9 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
> }
> }
>
> -static void prepare_set(void);
> -static void post_set(void);
> +static void prepare_set(unsigned long *cr4_p, u32 *deftype_lo_p,
> + u32 *deftype_hi_p);
> +static void post_set(unsigned long cr4, u32 deftype_lo, u32 deftype_hi);
>
> static void __init print_mtrr_state(void)
> {
> @@ -385,7 +386,8 @@ static void __init print_mtrr_state(void)
> void __init get_mtrr_state(void)
> {
> struct mtrr_var_range *vrs;
> - unsigned long flags;
> + unsigned long flags, cr4;
> + u32 deftype_lo, deftype_hi;
> unsigned lo, dummy;
> unsigned int i;
>
> @@ -420,11 +422,11 @@ void __init get_mtrr_state(void)
>
> /* PAT setup for BP. We need to go through sync steps here */
> local_irq_save(flags);
> - prepare_set();
> + prepare_set(&cr4, &deftype_lo, &deftype_hi);
>
> pat_init();
>
> - post_set();
> + post_set(cr4, deftype_lo, deftype_hi);
> local_irq_restore(flags);
> }
>
> @@ -610,15 +612,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
> return changed;
> }
>
> -static u32 deftype_lo, deftype_hi;
> -
> /**
> * set_mtrr_state - Set the MTRR state for this CPU.
> *
> * NOTE: The CPU must already be in a safe state for MTRR changes.
> * RETURNS: 0 if no changes made, else a mask indicating what was changed.
> */
> -static unsigned long set_mtrr_state(void)
> +static unsigned long set_mtrr_state(u32 *deftype_lo_p, u32 *deftype_hi_p)
> {
> unsigned long change_mask = 0;
> unsigned int i;
> @@ -635,10 +635,10 @@ static unsigned long set_mtrr_state(void)
> * Set_mtrr_restore restores the old value of MTRRdefType,
> * so to set it we fiddle with the saved value:
> */
> - if ((deftype_lo & 0xff) != mtrr_state.def_type
> - || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
> + if ((*deftype_lo_p & 0xff) != mtrr_state.def_type
> + || ((*deftype_lo_p & 0xc00) >> 10) != mtrr_state.enabled) {
>
> - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
> + *deftype_lo_p = (*deftype_lo_p & ~0xcff) | mtrr_state.def_type |
> (mtrr_state.enabled << 10);
> change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
> }
> @@ -647,9 +647,6 @@ static unsigned long set_mtrr_state(void)
> }
>
>
> -static unsigned long cr4;
> -static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
> -
> /*
> * Since we are disabling the cache don't allow any interrupts,
> * they would run extremely slow and would only increase the pain.
> @@ -657,7 +654,8 @@ static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
> * The caller must ensure that local interrupts are disabled and
> * are reenabled after post_set() has been called.
> */
> -static void prepare_set(void) __acquires(set_atomicity_lock)
> +static void prepare_set(unsigned long *cr4_p, u32 *deftype_lo_p,
> + u32 *deftype_hi_p)
> {
> unsigned long cr0;
>
> @@ -668,8 +666,6 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
> * changes to the way the kernel boots
> */
>
> - raw_spin_lock(&set_atomicity_lock);
> -
> /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
> cr0 = read_cr0() | X86_CR0_CD;
> write_cr0(cr0);
> @@ -677,22 +673,22 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
>
> /* Save value of CR4 and clear Page Global Enable (bit 7) */
> if (cpu_has_pge) {
> - cr4 = read_cr4();
> - write_cr4(cr4 & ~X86_CR4_PGE);
> + *cr4_p = read_cr4();
> + write_cr4(*cr4_p & ~X86_CR4_PGE);
> }
>
> /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
> __flush_tlb();
>
> /* Save MTRR state */
> - rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
> + rdmsr(MSR_MTRRdefType, *deftype_lo_p, *deftype_hi_p);
>
> /* Disable MTRRs, and set the default type to uncached */
> - mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
> + mtrr_wrmsr(MSR_MTRRdefType, *deftype_lo_p & ~0xcff, *deftype_hi_p);
> wbinvd();
> }
>
> -static void post_set(void) __releases(set_atomicity_lock)
> +static void post_set(unsigned long cr4, u32 deftype_lo, u32 deftype_hi)
> {
> /* Flush TLBs (no need to flush caches - they are disabled) */
> __flush_tlb();
> @@ -706,24 +702,24 @@ static void post_set(void) __releases(set_atomicity_lock)
> /* Restore value of CR4 */
> if (cpu_has_pge)
> write_cr4(cr4);
> - raw_spin_unlock(&set_atomicity_lock);
> }
>
> static void generic_set_all(void)
> {
> unsigned long mask, count;
> - unsigned long flags;
> + unsigned long flags, cr4;
> + u32 deftype_lo, deftype_hi;
>
> local_irq_save(flags);
> - prepare_set();
> + prepare_set(&cr4, &deftype_lo, &deftype_hi);
>
> /* Actually set the state */
> - mask = set_mtrr_state();
> + mask = set_mtrr_state(&deftype_lo, &deftype_hi);
>
> /* also set PAT */
> pat_init();
>
> - post_set();
> + post_set(cr4, deftype_lo, deftype_hi);
> local_irq_restore(flags);
>
> /* Use the atomic bitops to update the global mask */
> @@ -748,13 +744,14 @@ static void generic_set_all(void)
> static void generic_set_mtrr(unsigned int reg, unsigned long base,
> unsigned long size, mtrr_type type)
> {
> - unsigned long flags;
> + unsigned long flags, cr4;
> + u32 deftype_lo, deftype_hi;
> struct mtrr_var_range *vr;
>
> vr = &mtrr_state.var_ranges[reg];
>
> local_irq_save(flags);
> - prepare_set();
> + prepare_set(&cr4, &deftype_lo, &deftype_hi);
>
> if (size == 0) {
> /*
> @@ -773,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
> mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
> }
>
> - post_set();
> + post_set(cr4, deftype_lo, deftype_hi);
> local_irq_restore(flags);
> }
>

2012-08-07 16:33:06

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] Parallelize mtrr init between cpus

On 08/07/2012 12:29 AM, zhenzhong.duan wrote:
> Current code serialize mtrr init with set_atomicity_lock.
> Mtrr init is quite slow when we bootup on a hvm with large mem, vcpus
> and pci passthroughed devices(eg. 24 vcpus + 90G mem).
> It took about ~30 mins to bootup, after patch, it took ~2 min.

When you're saying "HVM" do you mean "Xen HVM"? How does it behave on
native hardware?

-hpa

2012-08-08 02:08:23

by Zhenzhong Duan

[permalink] [raw]
Subject: Re: [PATCH] Parallelize mtrr init between cpus



2012-08-08 00:32, H. Peter Anvin wrote:
> On 08/07/2012 12:29 AM, zhenzhong.duan wrote:
>> Current code serialize mtrr init with set_atomicity_lock.
>> Mtrr init is quite slow when we bootup on a hvm with large mem, vcpus
>> and pci passthroughed devices(eg. 24 vcpus + 90G mem).
>> It took about ~30 mins to bootup, after patch, it took ~2 min.
> When you're saying "HVM" do you mean "Xen HVM"? How does it behave on
> native hardware?
>
> -hpa
Yes, I mean Xen HVM. Bootup at same speed as before on baremetal.

2012-08-08 03:53:28

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] Parallelize mtrr init between cpus

On 08/07/2012 07:08 PM, zhenzhong.duan wrote:
>
>
> 2012-08-08 00:32, H. Peter Anvin wrote:
>> On 08/07/2012 12:29 AM, zhenzhong.duan wrote:
>>> Current code serialize mtrr init with set_atomicity_lock.
>>> Mtrr init is quite slow when we bootup on a hvm with large mem, vcpus
>>> and pci passthroughed devices(eg. 24 vcpus + 90G mem).
>>> It took about ~30 mins to bootup, after patch, it took ~2 min.
>> When you're saying "HVM" do you mean "Xen HVM"? How does it behave on
>> native hardware?
>>
>> -hpa
> Yes, I mean Xen HVM. Bootup at same speed as before on baremetal.

Then I would like to know why Xen HVM takes so infernally long. It
isn't a good idea to make gratuitous changes in the main kernel to work
around defects in Xen.

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2012-08-29 05:15:15

by Zhenzhong Duan

[permalink] [raw]
Subject: Re: [PATCH] Parallelize mtrr init between cpus



On 2012-08-08 11:53, H. Peter Anvin wrote:
> On 08/07/2012 07:08 PM, zhenzhong.duan wrote:
>>
>>
>> 2012-08-08 00:32, H. Peter Anvin wrote:
>>> On 08/07/2012 12:29 AM, zhenzhong.duan wrote:
>>>> Current code serialize mtrr init with set_atomicity_lock.
>>>> Mtrr init is quite slow when we bootup on a hvm with large mem, vcpus
>>>> and pci passthroughed devices(eg. 24 vcpus + 90G mem).
>>>> It took about ~30 mins to bootup, after patch, it took ~2 min.
>>> When you're saying "HVM" do you mean "Xen HVM"? How does it behave on
>>> native hardware?
>>>
>>> -hpa
>> Yes, I mean Xen HVM. Bootup at same speed as before on baremetal.
>
> Then I would like to know why Xen HVM takes so infernally long. It
> isn't a good idea to make gratuitous changes in the main kernel to
> work around defects in Xen.
>
> -hpa
Hi,
Sorry for the late response. I had an opportunity to test the patch on an
old G5 with 128 physical CPUs and 1 TB of memory. Around 0.82 seconds were
saved; the saving depends on the CPU count of the system.
I'm curious what the purpose of set_atomicity_lock is here — there may be
something I missed. Thanks.

Before patch:
[ 13.176038] NMI watchdog enabled, takes one hw-pmu counter.
[ 13.176444] Brought up 128 CPUs
[ 13.176688] Total of 128 processors activated (578760.49 BogoMIPS).
[ 14.035223] devtmpfs: initialized
[ 17.277652] PM: Registering ACPI NVS region at 7f76e000 (401408 bytes)
[ 17.279541] print_constraints: dummy:
[ 17.280576] NET: Registered protocol family 16
After patch:
[ 13.094227] smpboot cpu 127: start_ip = 90000
[ 13.185142] NMI watchdog enabled, takes one hw-pmu counter.
[ 13.185599] Brought up 128 CPUs
[ 13.185851] Total of 128 processors activated (578871.80 BogoMIPS).
[ 13.224783] devtmpfs: initialized
[ 16.468002] PM: Registering ACPI NVS region at 7f76e000 (401408 bytes)
[ 16.469706] print_constraints: dummy:
[ 16.470745] NET: Registered protocol family 16