2015-05-14 11:57:15

by Gu Zheng

[permalink] [raw]
Subject: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

The following lockdep warning occurrs when running with latest kernel:
[ 3.178000] ------------[ cut here ]------------
[ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[ 3.199000] Modules linked in:

[ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[ 3.246000] Call Trace:
[ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
init_espfix_ap() which is called before enabled local irq, and the lockdep
sub-system considers this behaviour as allocating memory with GFP_FS with
local irq disabled, then trigger the warning as mentioned about.
Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
you know, init_espfix_ap is called with preempt and local irq disabled,
it is not a good idea to use mutex (might sleep) here.
So we convert the initialization lock to spin_lock here to avoid the noise.

Signed-off-by: Gu Zheng <[email protected]>
Cc: Stable <[email protected]>
---
arch/x86/kernel/espfix_64.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..ceb35a3 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,14 +57,14 @@
# error "Need more than one PGD for the ESPFIX hack"
#endif

-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)

/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);

-/* Initialization mutex - should this be a spinlock? */
-static DEFINE_MUTEX(espfix_init_mutex);
+/* Initialization lock */
+static DEFINE_SPINLOCK(espfix_init_lock);

/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
@@ -144,6 +144,7 @@ void init_espfix_ap(void)
int n;
void *stack_page;
pteval_t ptemask;
+ unsigned long flags;

/* We only have to do this once... */
if (likely(this_cpu_read(espfix_stack)))
@@ -158,7 +159,7 @@ void init_espfix_ap(void)
if (likely(stack_page))
goto done;

- mutex_lock(&espfix_init_mutex);
+ spin_lock_irqsave(&espfix_init_lock, flags);

/* Did we race on the lock? */
stack_page = ACCESS_ONCE(espfix_pages[page]);
@@ -188,7 +189,7 @@ void init_espfix_ap(void)
}

pte_p = pte_offset_kernel(&pmd, addr);
- stack_page = (void *)__get_free_page(GFP_KERNEL);
+ stack_page = (void *)__get_free_page(PGALLOC_GFP);
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -197,7 +198,7 @@ void init_espfix_ap(void)
ACCESS_ONCE(espfix_pages[page]) = stack_page;

unlock_done:
- mutex_unlock(&espfix_init_mutex);
+ spin_unlock_irqrestore(&espfix_init_lock, flags);
done:
this_cpu_write(espfix_stack, addr);
this_cpu_write(espfix_waddr, (unsigned long)stack_page
--
1.7.7


2015-05-14 12:26:30

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

On Thu, May 14, 2015 at 07:37:45PM +0800, Gu Zheng wrote:
> The following lockdep warning occurrs when running with latest kernel:
> [ 3.178000] ------------[ cut here ]------------
> [ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> [ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> [ 3.199000] Modules linked in:
>
> [ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> [ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> [ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> [ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> [ 3.246000] Call Trace:
> [ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> [ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> [ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> [ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> [ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> [ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> [ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> [ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> [ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> [ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> [ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> [ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> [ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>
> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
> init_espfix_ap() which is called before enabled local irq, and the lockdep
> sub-system considers this behaviour as allocating memory with GFP_FS with
> local irq disabled, then trigger the warning as mentioned about.
> Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
> you know, init_espfix_ap is called with preempt and local irq disabled,
> it is not a good idea to use mutex (might sleep) here.
> So we convert the initialization lock to spin_lock here to avoid the noise.
>
> Signed-off-by: Gu Zheng <[email protected]>
> Cc: Stable <[email protected]>
> ---
> arch/x86/kernel/espfix_64.c | 13 +++++++------
> 1 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> index f5d0730..ceb35a3 100644
> --- a/arch/x86/kernel/espfix_64.c
> +++ b/arch/x86/kernel/espfix_64.c
> @@ -57,14 +57,14 @@
> # error "Need more than one PGD for the ESPFIX hack"
> #endif
>
> -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
> +#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)

IINM, that's ESPFIX_MAX_PAGES with GFP_ATOMIC which for 8K CPUs are 128
pages.

That's a lotta waste in my book for espfix stack pages.

Enabling interrupts earlier in start_secondary() is probably out of the
question, maybe we should prealloc all those pages...

hpa?

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-05-14 18:30:05

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex


* Borislav Petkov <[email protected]> wrote:

> On Thu, May 14, 2015 at 07:37:45PM +0800, Gu Zheng wrote:
> > The following lockdep warning occurrs when running with latest kernel:
> > [ 3.178000] ------------[ cut here ]------------
> > [ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
> > [ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
> > [ 3.199000] Modules linked in:
> >
> > [ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
> > [ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
> > [ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
> > [ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
> > [ 3.246000] Call Trace:
> > [ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
> > [ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
> > [ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
> > [ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
> > [ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
> > [ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
> > [ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
> > [ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
> > [ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
> > [ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
> > [ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
> > [ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
> > [ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
> >
> > This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
> > init_espfix_ap() which is called before enabled local irq, and the lockdep
> > sub-system considers this behaviour as allocating memory with GFP_FS with
> > local irq disabled, then trigger the warning as mentioned about.
> > Though here we use GFP_NOFS rather GFP_KERNEL to avoid the warning, but
> > you know, init_espfix_ap is called with preempt and local irq disabled,
> > it is not a good idea to use mutex (might sleep) here.
> > So we convert the initialization lock to spin_lock here to avoid the noise.
> >
> > Signed-off-by: Gu Zheng <[email protected]>
> > Cc: Stable <[email protected]>
> > ---
> > arch/x86/kernel/espfix_64.c | 13 +++++++------
> > 1 files changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
> > index f5d0730..ceb35a3 100644
> > --- a/arch/x86/kernel/espfix_64.c
> > +++ b/arch/x86/kernel/espfix_64.c
> > @@ -57,14 +57,14 @@
> > # error "Need more than one PGD for the ESPFIX hack"
> > #endif
> >
> > -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
> > +#define PGALLOC_GFP (GFP_ATOMIC | __GFP_NOTRACK | __GFP_ZERO)
>
> IINM, that's ESPFIX_MAX_PAGES with GFP_ATOMIC which for 8K CPUs are 128
> pages.
>
> That's a lotta waste in my book for espfix stack pages.
>
> Enabling interrupts earlier in start_secondary() is probably out of
> the question, maybe we should prealloc all those pages...

We could allocate them on the boot CPU side and hand them over to the
secondary CPU.

Thanks,

Ingo

2015-05-14 21:28:05

by Borislav Petkov

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

Remove stable@ from CC.

On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
> We could allocate them on the boot CPU side and hand them over to the
> secondary CPU.

Yeah, something along those lines. I mean, they're allocated and in-use
during the complete system lifetime, we might just as well allocate them
all in one go. Btw, what's our allocator that early, memblock?

Still, what I find strange is why are we seeing this only now? Is it
because it had to be a big box (cpu >= 128) or something else changed...?

Hohumm.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-05-14 22:27:46

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

On 05/14/2015 02:27 PM, Borislav Petkov wrote:
> Remove stable@ from CC.
>
> On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
>> We could allocate them on the boot CPU side and hand them over to the
>> secondary CPU.
>
> Yeah, something along those lines. I mean, they're allocated and in-use
> during the complete system lifetime, we might just as well allocate them
> all in one go. Btw, what's our allocator that early, memblock?
>
> Still, what I find strange is why are we seeing this only now? Is it
> because it had to be a big box (cpu >= 128) or something else changed...?
>

Quite probable. You don't really want to allocate them until you know
if a CPU at least exists, though.

I like Ingo's suggestion of allocating them before CPU bringup on the
initiating CPU.

-hpa

2015-05-15 06:54:23

by Ingo Molnar

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex


* H. Peter Anvin <[email protected]> wrote:

> On 05/14/2015 02:27 PM, Borislav Petkov wrote:
> > Remove stable@ from CC.
> >
> > On Thu, May 14, 2015 at 08:29:55PM +0200, Ingo Molnar wrote:
> >> We could allocate them on the boot CPU side and hand them over to
> >> the secondary CPU.
> >
> > Yeah, something along those lines. I mean, they're allocated and
> > in-use during the complete system lifetime, we might just as well
> > allocate them all in one go. Btw, what's our allocator that early,
> > memblock?
> >
> > Still, what I find strange is why are we seeing this only now? Is
> > it because it had to be a big box (cpu >= 128) or something else
> > changed...?
> >
>
> Quite probable. You don't really want to allocate them until you
> know if a CPU at least exists, though.
>
> I like Ingo's suggestion of allocating them before CPU bringup on
> the initiating CPU.

The only slightly subtle detail with that is to use alloc_pages_node()
with the secondary CPU's node, to make sure the espfix stack is
NUMA-local to the CPU that is going to use it.

Thanks,

Ingo

2015-05-15 07:28:45

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>
> The only slightly subtle detail with that is to use alloc_pages_node()
> with the secondary CPU's node, to make sure the espfix stack is
> NUMA-local to the CPU that is going to use it.
>

It doesn't hurt, although it isn't super critical as each page will be
shared among 64 CPUs. The whole espfix stack is only a single cacheline
long.

-hpa

2015-05-18 19:43:13

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

On 05/15/2015 12:27 AM, H. Peter Anvin wrote:
> On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>>
>> The only slightly subtle detail with that is to use alloc_pages_node()
>> with the secondary CPU's node, to make sure the espfix stack is
>> NUMA-local to the CPU that is going to use it.
>>
>
> It doesn't hurt, although it isn't super critical as each page will be
> shared among 64 CPUs. The whole espfix stack is only a single cacheline
> long.
>

I don't think we actually need these pages allocated until we try to run
user code. Can we move this very late in initialization instead?

--Andy

2015-05-19 15:05:12

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: use spin_lock rather than mutex

On 05/18/2015 12:43 PM, Andy Lutomirski wrote:
> On 05/15/2015 12:27 AM, H. Peter Anvin wrote:
>> On 05/14/2015 11:54 PM, Ingo Molnar wrote:
>>>
>>> The only slightly subtle detail with that is to use alloc_pages_node()
>>> with the secondary CPU's node, to make sure the espfix stack is
>>> NUMA-local to the CPU that is going to use it.
>>>
>>
>> It doesn't hurt, although it isn't super critical as each page will be
>> shared among 64 CPUs. The whole espfix stack is only a single cacheline
>> long.
>>
>
> I don't think we actually need these pages allocated until we try to run
> user code. Can we move this very late in initialization instead?
>

Yes, we could, as long as it is run on each CPU before that CPU tries to
run user code.

-hpa

2015-05-22 10:32:49

by Gu Zheng

[permalink] [raw]
Subject: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP

The following lockdep warning occurs when running with 4.1.0-rc3:
[ 3.178000] ------------[ cut here ]------------
[ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
[ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
[ 3.199000] Modules linked in:

[ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
[ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
[ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
[ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
[ 3.246000] Call Trace:
[ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
[ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
[ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
[ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
[ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
[ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
[ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
[ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
[ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
[ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
[ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
[ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
[ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---

This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
init_espfix_ap() which is called before enabled local irq, and the lockdep
sub-system considers this behaviour as allocating memory with GFP_FS with
local irq disabled, then trigger the warning as mentioned about.

Though we could allocate them on the boot CPU side and hand them over to
the secondary CPU, but it seems a waste if some of cpus are still offline.
As there is no need to these pages(espfix stack) until we try to run user
code, so we can postpone the initialization of espfix stack after cpu
booted to avoid the noise.

Signed-off-by: Gu Zheng <[email protected]>
---
arch/x86/kernel/smpboot.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547e..3ce05de 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();

/*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
-#endif
-
- /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
}
}

+ /*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
/* mark "stuck" area as not stuck */
*trampoline_status = 0;

--
1.8.3.1

2015-06-02 09:44:02

by Gu Zheng

[permalink] [raw]
Subject: Re: [RFC PATCH] x86, espfix: postpone the initialization of espfix stack for AP

Hi Andy,

Sorry for late reply.
On 05/29/2015 09:07 AM, Andy Lutomirski wrote:

> On Wed, May 27, 2015 at 6:20 PM, Gu Zheng <[email protected]> wrote:
>> ping...
>>
>> On 05/22/2015 06:13 PM, Gu Zheng wrote:
>>
>>> The following lockdep warning occurs when running with 4.1.0-rc3:
>>> [ 3.178000] ------------[ cut here ]------------
>>> [ 3.183000] WARNING: CPU: 128 PID: 0 at kernel/locking/lockdep.c:2755 lockdep_trace_alloc+0xdd/0xe0()
>>> [ 3.193000] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>>> [ 3.199000] Modules linked in:
>>>
>>> [ 3.203000] CPU: 128 PID: 0 Comm: swapper/128 Not tainted 4.1.0-rc3 #70
>>> [ 3.221000] 0000000000000000 2d6601fb3e6d4e4c ffff88086fd5fc38 ffffffff81773f0a
>>> [ 3.230000] 0000000000000000 ffff88086fd5fc90 ffff88086fd5fc78 ffffffff8108c85a
>>> [ 3.238000] ffff88086fd60000 0000000000000092 ffff88086fd60000 00000000000000d0
>>> [ 3.246000] Call Trace:
>>> [ 3.249000] [<ffffffff81773f0a>] dump_stack+0x4c/0x65
>>> [ 3.255000] [<ffffffff8108c85a>] warn_slowpath_common+0x8a/0xc0
>>> [ 3.261000] [<ffffffff8108c8e5>] warn_slowpath_fmt+0x55/0x70
>>> [ 3.268000] [<ffffffff810ee24d>] lockdep_trace_alloc+0xdd/0xe0
>>> [ 3.274000] [<ffffffff811cda0d>] __alloc_pages_nodemask+0xad/0xca0
>>> [ 3.281000] [<ffffffff810ec7ad>] ? __lock_acquire+0xf6d/0x1560
>>> [ 3.288000] [<ffffffff81219c8a>] alloc_page_interleave+0x3a/0x90
>>> [ 3.295000] [<ffffffff8121b32d>] alloc_pages_current+0x17d/0x1a0
>>> [ 3.301000] [<ffffffff811c869e>] ? __get_free_pages+0xe/0x50
>>> [ 3.308000] [<ffffffff811c869e>] __get_free_pages+0xe/0x50
>>> [ 3.314000] [<ffffffff8102640b>] init_espfix_ap+0x17b/0x320
>>> [ 3.320000] [<ffffffff8105c691>] start_secondary+0xf1/0x1f0
>>> [ 3.327000] ---[ end trace 1b3327d9d6a1d62c ]---
>>>
>>> This seems a mis-warning by lockdep, as we alloc pages with GFP_KERNEL in
>>> init_espfix_ap() which is called before enabled local irq, and the lockdep
>>> sub-system considers this behaviour as allocating memory with GFP_FS with
>>> local irq disabled, then trigger the warning as mentioned about.
>>>
>>> Though we could allocate them on the boot CPU side and hand them over to
>>> the secondary CPU, but it seems a waste if some of cpus are still offline.
>>> As there is no need to these pages(espfix stack) until we try to run user
>>> code, so we can postpone the initialization of espfix stack after cpu
>>> booted to avoid the noise.
>
> Does this pass the sigreturn_32 test on both 32-bit and 64-bit kernels
> and sigreturn_64 test on 64-bit kernels? (The test is in
> tools/testing/selftests/x86.) If so, looks good to me.

It failed the test.
There seems a bug in this patch, it alloc espfix stack in the do_boot_cpu
routine, not in the context of target cpu that we want to boot up, so the simple
change is wrong here.
I will send the v2 version soon, and it passed the tests you mentioned above.

Thanks again for your comments and suggestion.

Regards,
Gu


>
> --Andy
>
>>>
>>> Signed-off-by: Gu Zheng <[email protected]>
>>> ---
>>> arch/x86/kernel/smpboot.c | 14 +++++++-------
>>> 1 file changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>>> index 50e547e..3ce05de 100644
>>> --- a/arch/x86/kernel/smpboot.c
>>> +++ b/arch/x86/kernel/smpboot.c
>>> @@ -240,13 +240,6 @@ static void notrace start_secondary(void *unused)
>>> check_tsc_sync_target();
>>>
>>> /*
>>> - * Enable the espfix hack for this CPU
>>> - */
>>> -#ifdef CONFIG_X86_ESPFIX64
>>> - init_espfix_ap();
>>> -#endif
>>> -
>>> - /*
>>> * We need to hold vector_lock so there the set of online cpus
>>> * does not change while we are assigning vectors to cpus. Holding
>>> * this lock ensures we don't half assign or remove an irq from a cpu.
>>> @@ -901,6 +894,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>>> }
>>> }
>>>
>>> + /*
>>> + * Enable the espfix hack for this CPU
>>> + */
>>> +#ifdef CONFIG_X86_ESPFIX64
>>> + init_espfix_ap();
>>> +#endif
>>> +
>>> /* mark "stuck" area as not stuck */
>>> *trampoline_status = 0;
>>>
>>
>>
>
>
>