Hi Thomas,
I've been chasing down a use-after-free on an irq_desc structure during
repeated device removal testing that crashes 4.3 in
smp_irq_move_cleanup_interrupt. So far I have a bunch of crashes and some
data gleaned from instrumenting the kernel with trace logging. Details to
follow, but in short, I think I'm hitting the use-after-free because of
a782a7e46bb5 "x86/irq: Store irq descriptor in vector array".
During the tests, I usually see RCU stalls (sometimes a crash later) --
when looking at a vmcore, the CPU stall is here:
crash> bt ffff881038ee4200
PID: 0 TASK: ffff881038ee4200 CPU: 1 COMMAND: "swapper/1"
[exception RIP: _raw_spin_lock+0x10]
RIP: ffffffff81672a10 RSP: ffff88103f843f70 RFLAGS: 00000046
RAX: 0000000000000000 RBX: 0000000000000091 RCX: 0000000000000001
RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffffffff81d36a88
RBP: ffff88103f843fa8 R8: 0000000000000101 R9: 0000000000000f78
R10: 0000000000100000 R11: ffff8810384ccf8c R12: 0000000000000091
R13: ffff88102a819794 R14: 000000000000b5e8 R15: ffff88102a8196f8
CS: 0010 SS: 0018
#0 [ffff88103f843f70] smp_irq_move_cleanup_interrupt at ffffffff8105319b
#1 [ffff88103f843fb0] irq_move_cleanup_interrupt at ffffffff81673a04
--- <IRQ stack> ---
#2 [ffff881038f0fd90] irq_move_cleanup_interrupt at ffffffff81673a04
[exception RIP: unknown or invalid address]
RIP: 000000000000001d RSP: 0000000000000000 RFLAGS: ffff881038f10000
RAX: 0000000000000001 RBX: ffff881038f0c000 RCX: 0000000000000005
RDX: 00000000000a087a RSI: 0000000000051ca2 RDI: 0000000000000018
RBP: ffff88103f84f100 R8: ffff881038f0fea0 R9: ffffe8f000042378
R10: 0000000000000002 R11: 000000ee954efd02 R12: ffffffff810e5060
R13: ffff881038f0fdc8 R14: ffff88103f84f500 R15: ffff88103f84f100
ORIG_RAX: ffff88103f856c40 CS: 8000a044 SS: ffffffffffffffdf
bt: WARNING: possibly bogus exception frame
#3 [ffff881038f0fe38] cpuidle_enter_state at ffffffff8151e278
#4 [ffff881038f0fea8] cpuidle_enter at ffffffff8151e417
#5 [ffff881038f0feb8] call_cpuidle at ffffffff810bef42
#6 [ffff881038f0fed0] cpu_startup_entry at ffffffff810bf1dc
#7 [ffff881038f0ff28] start_secondary at ffffffff8104e9e0
The irq_desc is in R15: ffff88102a8196f8
crash> kmem ffff88102a8196f8
CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
ffff88103f408c80 kmalloc-512 512 5292 5460 140 32k
SLAB MEMORY NODE TOTAL ALLOCATED FREE
ffffea0040aa0600 ffff88102a818000 0 39 17 22
FREE / [ALLOCATED]
ffff88102a8196f8
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea0040aa0640 102a819000 0 0 0 2fffff80008000 tail
This irq_desc is no longer allocated, it's been filled with the slub debug
poison pattern (hence the spinlock is stuck):
crash> rd ffff88102a8196f8 0x28
ffff88102a8196f8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819708: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819718: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819728: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819738: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819748: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819758: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819768: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819778: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819788: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819798: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197a8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197b8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197c8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197d8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197e8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a8197f8: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819808: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819818: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
ffff88102a819828: 6b6b6b6b6b6b6b6b 6b6b6b6b6b6b6b6b kkkkkkkkkkkkkkkk
The irq vector is in RBX: 0000000000000091
crash> p/d 0x91
$1 = 145
The irq_desc_tree entry for vector 0x91 appears to have been updated to
something new (the device re-add):
crash> tree -t radix irq_desc_tree -s irq_desc | grep 'irq = 0x91' -B11 | grep -e '^ffff' -e 'irq ='
ffff88102fb144e8
irq = 0x91,
But from a custom crash gdb script, the freed irq_desc can still be found
in CPU 1's vector_irq[145]:
cpu[1] vector_irq[145] irq_desc @ 0xffff88102a8196f8
Sifting through a heavily grepped and abbreviated trace log:
- The irq moved from CPU 1, vector 145 to CPU 44, vector 134
- Both CPUs skip cleaning up their vector_irq[] entries for this irq
because data->move_in_progress is set (is this normal?)
- The irq moves again to CPU 34, vector 123
- The underlying device is removed @ 593.106514 jiffies
- The irq_desc is freed @ 593.239879 jiffies
- CPU 1 is last heard from @ 1022.830083 jiffies
[001] 22.936764: __assign_irq_vector : cpu 44 : vector=134 -> 0xffff88102a8196f8
[044] 35.209945: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
[001] 35.212370: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[044] 35.674114: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
[001] 35.675395: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[044] 36.265716: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
[044] 36.517785: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
[001] 36.520115: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[001] 54.636651: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[001] 54.636722: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[044] 61.670267: __assign_irq_vector : cpu 34 : vector=123 -> 0xffff88102a8196f8
[001] 61.670979: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
[044] 61.696120: smp_irq_move_cleanup_interrupt : cpu (this) : vector=134 0xffff88102a8196f8 -> (nil)
[034] 69.086107: smp_irq_move_cleanup_interrupt : vector == data->cfg.vector && ... : vector=123 0xffff88102a8196f8
[000] 593.239811: clear_irq_vector : 1 : cpu 34 : vector=123 0xffff88102a8196f8 -> (nil)
[000] 593.239879: free_desc : desc @ 0xffff88102a8196f8
[001] 1022.830083: smp_irq_move_cleanup_interrupt : IS_ERR_OR_NULL : vector=144 (nil)
crash> log | grep 'mpt3sas[0-9]*: _scsih_remove$'
[ 291.658410] mpt3sas0: _scsih_remove
[ 593.106514] mpt3sas1: _scsih_remove << free_desc
[ 874.345239] mpt3sas3: _scsih_remove
Prior to a782a7e46bb5 "x86/irq: Store irq descriptor in vector array", the
vector_irq array held irq values, those interested in the irq_desc would
have to perform their own irq_to_desc() (ie, a radix_tree_lookup of the
irq_desc_tree). The radix tree is updated in free_desc(), so any
subsequent lookups would fail.
After the change though, I'm not so sure that direct access to the
irq_desc is safe -- I couldn't figure out anything preventing
use-after-free (the sparse_irq_lock and the vector_lock seem to protect
their respective data-structures, but not the dual-use of the irq_desc's
by both the irq_desc_tree and the per-cpu vector_irq[].)
Hopefully this is on the right track -- I can modify the trace logging or
run other tests if there is additional information that would be helpful.
Regards,
-- Joe
Joe,
On Mon, 23 Nov 2015, Joe Lawrence wrote:
Nice detective work!
> The irq_desc is in R15: ffff88102a8196f8
>
> This irq_desc is no longer allocated, it's been filled with the slub debug
> poison pattern (hence the spinlock is stuck):
>
> The irq vector is in RBX: 0000000000000091
>
> But from a custom crash gdb script, the freed irq_desc can still be found
> in CPU 1's vector_irq[145]:
>
> cpu[1] vector_irq[145] irq_desc @ 0xffff88102a8196f8
>
> Sifting through a heavily grepped and abbreviated trace log:
>
> - The irq moved from CPU 1, vector 145 to CPU 44, vector 134
> - Both CPUs skip cleaning up their vector_irq[] entries for this irq
> because data->move_in_progress is set (is this normal?)
Yes. The cleanup interrupt is not targeting a particular vector.
> - The irq moves again to CPU 34, vector 123
> - The underlying device is removed @ 593.106514 jiffies
> - The irq_desc is freed @ 593.239879 jiffies
> - CPU 1 is last heard from @ 1022.830083 jiffies
>
> [001] 22.936764: __assign_irq_vector : cpu 44 : vector=134 -> 0xffff88102a8196f8
> [044] 35.209945: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
> [001] 35.212370: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
> [044] 35.674114: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
> [001] 35.675395: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
> [044] 36.265716: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
> [044] 36.517785: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=134 0xffff88102a8196f8
> [001] 36.520115: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
> [001] 54.636651: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
> [001] 54.636722: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
So I assume, that no interrupt happened so far.
> [044] 61.670267: __assign_irq_vector : cpu 34 : vector=123 -> 0xffff88102a8196f8
Now it's moved again.
> [001] 61.670979: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
> [044] 61.696120: smp_irq_move_cleanup_interrupt : cpu (this) : vector=134 0xffff88102a8196f8 -> (nil)
> [034] 69.086107: smp_irq_move_cleanup_interrupt : vector == data->cfg.vector && ... : vector=123 0xffff88102a8196f8
> [000] 593.239811: clear_irq_vector : 1 : cpu 34 : vector=123 0xffff88102a8196f8 -> (nil)
> [000] 593.239879: free_desc : desc @ 0xffff88102a8196f8
> [001] 1022.830083: smp_irq_move_cleanup_interrupt : IS_ERR_OR_NULL : vector=144 (nil)
Ok. Here is the issue. So I assume the following happens:
CPU0 CPU1 CPU34 CPU44 CPUxx
send_IPI(CLEANUP)
smp_irq_move_cleanup_interrupt
sees data->move_in_progress
device_irq happens
data->move_in_progress = 0
send_IPI(CLEANUP)
Does not receive another
cleanup IPI for whatever
reason .... See below.
smp_irq_move_cleanup_interrupt
vector = NULL
free_vector on CPU34
Does not clean CPU1 because
data->move_in_progress is not set.
free_desc
send_IPI(CLEANUP)
smp_irq_move_cleanup_interrupt
Accesses random memory
> Prior to a782a7e46bb5 "x86/irq: Store irq descriptor in vector array", the
> vector_irq array held irq values, those interested in the irq_desc would
> have to perform their own irq_to_desc() (ie, a radix_tree_lookup of the
> irq_desc_tree). The radix tree is updated in free_desc(), so any
> subsequent lookups would fail.
Yes, that above race has existed forever and the irq_to_desc() check
papered over it. In case that the irq number was reassigned the thing
operated on the wrong descriptor. Pretty bad as well as it fiddles
with the wrong bits. Though it clears the vector ...
The problem is actually in the vector assignment code.
> [001] 22.936764: __assign_irq_vector : cpu 44 : vector=134 -> 0xffff88102a8196f8
No interrupt happened so far. So nothing cleans up the vector on cpu 1
> [044] 61.670267: __assign_irq_vector : cpu 34 : vector=123 -> 0xffff88102a8196f8
Now that moves it from 44 to 34 and ignores that cpu 1 still has the
vector assigned. __assign_irq_vector unconditionally overwrites
data->old_domain, so the bit of cpu 1 is lost ....
I'm staring into the code to figure out a fix ....
Thanks,
tglx
On Wed, 25 Nov 2015, Thomas Gleixner wrote:
> The problem is actually in the vector assignment code.
>
> > [001] 22.936764: __assign_irq_vector : cpu 44 : vector=134 -> 0xffff88102a8196f8
>
> No interrupt happened so far. So nothing cleans up the vector on cpu 1
>
> > [044] 61.670267: __assign_irq_vector : cpu 34 : vector=123 -> 0xffff88102a8196f8
>
> Now that moves it from 44 to 34 and ignores that cpu 1 still has the
> vector assigned. __assign_irq_vector unconditionally overwrites
> data->old_domain, so the bit of cpu 1 is lost ....
>
> I'm staring into the code to figure out a fix ....
Just to figure out that my analysis was completely wrong.
__assign_irq_vector()
{
if (d->move_in_progress)
return -EBUSY;
...
So that cannot happen. Now the question is:
> [001] 22.936764: __assign_irq_vector : cpu 44 : vector=134 -> 0xffff88102a8196f8
So CPU1 sees still data->move_in_progress
[001] 54.636722: smp_irq_move_cleanup_interrupt : data->move_in_progress : vector=145 0xffff88102a8196f8
And why does __assign_irq_vector not see it, but no cleanup vector was
received by cpu1 with data->move_in_progress == 0.
> [044] 61.670267: __assign_irq_vector : cpu 34 : vector=123 -> 0xffff88102a8196f8
Ahhhhh.
__send_cleanup_vector()
{
send_IPI()
move_in_progress = 0;
}
So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
does not get another IPI before the next move ..... That has been that
way forever.
Duh. Working on a real fix this time.
Thanks,
tglx
On Wed, 25 Nov 2015, Thomas Gleixner wrote:
> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
> does not get another IPI before the next move ..... That has been that
> way forever.
>
> Duh. Working on a real fix this time.
Here you go. Completely untested of course.
Larger than I hoped for, but the simple fix of just clearing the
move_in_progress flag before sending the IPI does not work because:
CPU0 CPU1 CPU2
data->move_in_progress=0
sendIPI()
set_affinity()
lock_vector() handle_IPI
move_in_progress = 1 lock_vector()
unlock_vector()
move_in_progress == 1
-> no cleanup
So we are back to square one. Now one might think that taking vector
lock prevents that issue:
CPU0 CPU1 CPU2
lock_vector()
data->move_in_progress=0
sendIPI()
unlock_vector()
set_affinity()
assign_irq_vector()
lock_vector() handle_IPI
move_in_progress = 1 lock_vector()
unlock_vector()
move_in_progress == 1
Not really.
So now the solution is:
CPU0 CPU1 CPU2
lock_vector()
data->move_in_progress=0
data->cleanup_mask = data->old_domain
sendIPI()
unlock_vector()
set_affinity()
assign_irq_vector()
lock_vector()
if (move_in_progress ||
!empty(cleanup_mask)) {
unlock_vector()
return -EBUSY; handle_IPI
} lock_vector()
move_in_progress == 0
cpu is set in cleanup mask
->cleanup vector
Looks a bit overkill with the extra cpumask. I tried a simple counter
but that does not work versus cpu unplug as we do not know whether the
outgoing cpu is involved in the cleanup or not. And if the cpu is
involved we starve assign_irq_vector() ....
The upside of this is that we get rid of that atomic allocation in
__send_cleanup_vector().
Brain hurts by now.
Not-Yet-Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 37 ++++++++++++++++---------------------
1 file changed, 16 insertions(+), 21 deletions(-)
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -25,6 +25,7 @@ struct apic_chip_data {
struct irq_cfg cfg;
cpumask_var_t domain;
cpumask_var_t old_domain;
+ cpumask_var_t cleanup_mask;
u8 move_in_progress : 1;
};
@@ -83,7 +84,11 @@ static struct apic_chip_data *alloc_apic
goto out_data;
if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
goto out_domain;
+ if (!zalloc_cpumask_var_node(&data->cleanup_mask, GFP_KERNEL, node))
+ goto out_old;
return data;
+out_old:
+ free_cpumask_var(data->old_domain);
out_domain:
free_cpumask_var(data->domain);
out_data:
@@ -96,6 +101,7 @@ static void free_apic_chip_data(struct a
if (data) {
free_cpumask_var(data->domain);
free_cpumask_var(data->old_domain);
+ free_cpumask_var(data->cleanup_mask);
kfree(data);
}
}
@@ -118,7 +124,7 @@ static int __assign_irq_vector(int irq,
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
- if (d->move_in_progress)
+ if (d->move_in_progress || !cpumask_empty(d->cleanup_mask))
return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
@@ -511,20 +517,11 @@ static struct irq_chip lapic_controller
#ifdef CONFIG_SMP
static void __send_cleanup_vector(struct apic_chip_data *data)
{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
-
- for_each_cpu_and(i, data->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i),
- IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
+ raw_spin_lock(&vector_lock);
+ cpumask_and(data->cleanup_mask, data->old_domain, cpu_online_mask);
data->move_in_progress = 0;
+ apic->send_IPI_mask(data->cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ raw_spin_unlock(&vector_lock);
}
void send_cleanup_vector(struct irq_cfg *cfg)
@@ -568,14 +565,11 @@ asmlinkage __visible void smp_irq_move_c
goto unlock;
/*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
+ * Nothing to cleanup if irq migration is in progress
+ * or this cpu is not set in the cleanup mask.
*/
- if (data->move_in_progress)
- goto unlock;
-
- if (vector == data->cfg.vector &&
- cpumask_test_cpu(me, data->domain))
+ if (data->move_in_progress ||
+ !cpumask_test_cpu(me, data->cleanup_mask))
goto unlock;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -591,6 +585,7 @@ asmlinkage __visible void smp_irq_move_c
goto unlock;
}
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ cpumask_clear_cpu(me, data->cleanup_mask);
unlock:
raw_spin_unlock(&desc->lock);
}
On 11/25/2015 04:12 PM, Thomas Gleixner wrote:
> On Wed, 25 Nov 2015, Thomas Gleixner wrote:
>> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
>> does not get another IPI before the next move ..... That has been that
>> way forever.
>>
>> Duh. Working on a real fix this time.
>
> Here you go. Completely untested of course.
>
Hi Thomas -- thanks for taking a look! (Well, the analysis looks like
more than *just* a look :)
I'll give the patch a go when I get back in the office next week.
-- Joe
On 2015/11/26 5:12, Thomas Gleixner wrote:
> On Wed, 25 Nov 2015, Thomas Gleixner wrote:
>> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
>> does not get another IPI before the next move ..... That has been that
>> way forever.
>>
>> Duh. Working on a real fix this time.
>
> Here you go. Completely untested of course.
>
> Larger than I hoped for, but the simple fix of just clearing the
> move_in_progress flag before sending the IPI does not work because:
>
> CPU0 CPU1 CPU2
> data->move_in_progress=0
> sendIPI()
> set_affinity()
> lock_vector() handle_IPI
> move_in_progress = 1 lock_vector()
> unlock_vector()
> move_in_progress == 1
> -> no cleanup
>
> So we are back to square one. Now one might think that taking vector
> lock prevents that issue:
>
> CPU0 CPU1 CPU2
> lock_vector()
> data->move_in_progress=0
> sendIPI()
> unlock_vector()
> set_affinity()
> assign_irq_vector()
> lock_vector() handle_IPI
> move_in_progress = 1 lock_vector()
> unlock_vector()
> move_in_progress == 1
> Not really.
>
> So now the solution is:
>
> CPU0 CPU1 CPU2
> lock_vector()
> data->move_in_progress=0
> data->cleanup_mask = data->old_domain
> sendIPI()
> unlock_vector()
> set_affinity()
> assign_irq_vector()
> lock_vector()
> if (move_in_progress ||
> !empty(cleanup_mask)) {
> unlock_vector()
> return -EBUSY; handle_IPI
> } lock_vector()
> move_in_progress == 0
> cpu is set in cleanup mask
> ->cleanup vector
>
> Looks a bit overkill with the extra cpumask. I tried a simple counter
> but that does not work versus cpu unplug as we do not know whether the
> outgoing cpu is involved in the cleanup or not. And if the cpu is
> involved we starve assign_irq_vector() ....
>
> The upside of this is that we get rid of that atomic allocation in
> __send_cleanup_vector().
Hi Thomas,
Maybe more headache for you now:)
It seems there are still rooms for improvements. First it
seems we could just reuse old_domain instead of adding cleanup_mask.
Second I found another race window among x86_vector_free_irqs(),
__send_cleanup_vector() and smp_irq_move_cleanup_interrupt().
I'm trying to refine your patch based following rules:
1) move_in_progress controls whether we need to send IPIs
2) old_domain controls which CPUs we should do clean up
3) assign_irq_vector checks both move_in_progress and old_domain.
Will send out the patch soon for comments:)
Thanks,
Gerry
>
> Brain hurts by now.
>
> Not-Yet-Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/kernel/apic/vector.c | 37 ++++++++++++++++---------------------
> 1 file changed, 16 insertions(+), 21 deletions(-)
>
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -25,6 +25,7 @@ struct apic_chip_data {
> struct irq_cfg cfg;
> cpumask_var_t domain;
> cpumask_var_t old_domain;
> + cpumask_var_t cleanup_mask;
> u8 move_in_progress : 1;
> };
>
> @@ -83,7 +84,11 @@ static struct apic_chip_data *alloc_apic
> goto out_data;
> if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
> goto out_domain;
> + if (!zalloc_cpumask_var_node(&data->cleanup_mask, GFP_KERNEL, node))
> + goto out_old;
> return data;
> +out_old:
> + free_cpumask_var(data->old_domain);
> out_domain:
> free_cpumask_var(data->domain);
> out_data:
> @@ -96,6 +101,7 @@ static void free_apic_chip_data(struct a
> if (data) {
> free_cpumask_var(data->domain);
> free_cpumask_var(data->old_domain);
> + free_cpumask_var(data->cleanup_mask);
> kfree(data);
> }
> }
> @@ -118,7 +124,7 @@ static int __assign_irq_vector(int irq,
> static int current_offset = VECTOR_OFFSET_START % 16;
> int cpu, err;
>
> - if (d->move_in_progress)
> + if (d->move_in_progress || !cpumask_empty(d->cleanup_mask))
> return -EBUSY;
>
> /* Only try and allocate irqs on cpus that are present */
> @@ -511,20 +517,11 @@ static struct irq_chip lapic_controller
> #ifdef CONFIG_SMP
> static void __send_cleanup_vector(struct apic_chip_data *data)
> {
> - cpumask_var_t cleanup_mask;
> -
> - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
> - unsigned int i;
> -
> - for_each_cpu_and(i, data->old_domain, cpu_online_mask)
> - apic->send_IPI_mask(cpumask_of(i),
> - IRQ_MOVE_CLEANUP_VECTOR);
> - } else {
> - cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
> - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> - free_cpumask_var(cleanup_mask);
> - }
> + raw_spin_lock(&vector_lock);
> + cpumask_and(data->cleanup_mask, data->old_domain, cpu_online_mask);
> data->move_in_progress = 0;
> + apic->send_IPI_mask(data->cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> + raw_spin_unlock(&vector_lock);
> }
>
> void send_cleanup_vector(struct irq_cfg *cfg)
> @@ -568,14 +565,11 @@ asmlinkage __visible void smp_irq_move_c
> goto unlock;
>
> /*
> - * Check if the irq migration is in progress. If so, we
> - * haven't received the cleanup request yet for this irq.
> + * Nothing to cleanup if irq migration is in progress
> + * or this cpu is not set in the cleanup mask.
> */
> - if (data->move_in_progress)
> - goto unlock;
> -
> - if (vector == data->cfg.vector &&
> - cpumask_test_cpu(me, data->domain))
> + if (data->move_in_progress ||
> + !cpumask_test_cpu(me, data->cleanup_mask))
> goto unlock;
>
> irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
> @@ -591,6 +585,7 @@ asmlinkage __visible void smp_irq_move_c
> goto unlock;
> }
> __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
> + cpumask_clear_cpu(me, data->cleanup_mask);
> unlock:
> raw_spin_unlock(&desc->lock);
> }
>
On Fri, 27 Nov 2015, Jiang Liu wrote:
Please trim your replies.
> On 2015/11/26 5:12, Thomas Gleixner wrote:
> > Looks a bit overkill with the extra cpumask. I tried a simple counter
> > but that does not work versus cpu unplug as we do not know whether the
> > outgoing cpu is involved in the cleanup or not. And if the cpu is
> > involved we starve assign_irq_vector() ....
> >
> > The upside of this is that we get rid of that atomic allocation in
> > __send_cleanup_vector().
>
> Maybe more headache for you now:)
> It seems there are still rooms for improvements. First it
> seems we could just reuse old_domain instead of adding cleanup_mask.
I really like to get rid of that atomic allocation in
__send_cleanup_vector()
> Second I found another race window among x86_vector_free_irqs(),
> __send_cleanup_vector() and smp_irq_move_cleanup_interrupt().
What's the race there?
> I'm trying to refine your patch based following rules:
> 1) move_in_progress controls whether we need to send IPIs
> 2) old_domain controls which CPUs we should do clean up
> 3) assign_irq_vector checks both move_in_progress and old_domain.
> Will send out the patch soon for comments:)
Sure.
Thanks,
tglx
Function __assign_irq_vector() makes use of apic_chip_data.old_domain
as a temporary buffer, which causes trouble to rollback logic in case of
failure. So use a dedicated temporary buffer for __assign_irq_vector().
Signed-off-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/vector.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 861bc59c8f25..d6ec36b4461e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -30,7 +30,7 @@ struct apic_chip_data {
struct irq_domain *x86_vector_domain;
static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask;
+static cpumask_var_t vector_cpumask, used_cpumask;
static struct irq_chip lapic_controller;
#ifdef CONFIG_X86_IO_APIC
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -124,6 +124,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
cpumask_clear(d->old_domain);
+ cpumask_clear(used_cpumask);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
@@ -157,9 +158,8 @@ next:
}
if (unlikely(current_vector == vector)) {
- cpumask_or(d->old_domain, d->old_domain,
- vector_cpumask);
- cpumask_andnot(vector_cpumask, mask, d->old_domain);
+ cpumask_or(used_cpumask, used_cpumask, vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, used_cpumask);
cpu = cpumask_first_and(vector_cpumask,
cpu_online_mask);
continue;
@@ -404,6 +404,7 @@ int __init arch_early_irq_init(void)
arch_init_htirq_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&used_cpumask, GFP_KERNEL));
return arch_early_ioapic_init();
}
--
1.7.10.4
Enhance __assign_irq_vector() to rollback in case of failure so the
caller doesn't need to explicitly rollback.
Signed-off-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/vector.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index d6ec36b4461e..f03957e7c50d 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -117,6 +117,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
+ unsigned int dest = d->cfg.dest_apicid;
if (d->move_in_progress)
return -EBUSY;
@@ -140,11 +141,16 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
* the current in use mask. So cleanup the vector
* allocation for the members that are not used anymore.
*/
+ cpumask_and(used_cpumask, d->domain, vector_cpumask);
+ err = apic->cpu_mask_to_apicid_and(mask, used_cpumask,
+ &dest);
+ if (err)
+ break;
cpumask_andnot(d->old_domain, d->domain,
vector_cpumask);
d->move_in_progress =
cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_and(d->domain, d->domain, vector_cpumask);
+ cpumask_copy(d->domain, used_cpumask);
break;
}
@@ -167,11 +173,13 @@ next:
if (test_bit(vector, used_vectors))
goto next;
-
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
+ if (apic->cpu_mask_to_apicid_and(mask, vector_cpumask, &dest))
+ goto next;
+
/* Found one! */
current_vector = vector;
current_offset = offset;
@@ -190,8 +198,7 @@ next:
if (!err) {
/* cache destination APIC IDs into cfg->dest_apicid */
- err = apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid);
+ d->cfg.dest_apicid = dest;
}
return err;
@@ -493,14 +500,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
return -EINVAL;
err = assign_irq_vector(irq, data, dest);
- if (err) {
- if (assign_irq_vector(irq, data,
- irq_data_get_affinity_mask(irq_data)))
- pr_err("Failed to recover vector for irq %d\n", irq);
- return err;
- }
- return IRQ_SET_MASK_OK;
+ return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
--
1.7.10.4
There's a race condition between x86_vector_free_irqs()
{
free_apic_chip_data(irq_data->chip_data);
xxxxx //irq_data->chip_data has been freed, but the pointer
//hasn't been reset yet
irq_domain_reset_irq_data(irq_data);
}
and smp_irq_move_cleanup_interrupt()
{
raw_spin_lock(&vector_lock);
data = apic_chip_data(irq_desc_get_irq_data(desc));
access data->xxxx // may access freed memory
raw_spin_unlock(&desc->lock);
}
, which may cause smp_irq_move_cleanup_interrupt() accesses freed memory.
So use vector_lock to guard all memory free code in x86_vector_free_irqs().
Signed-off-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/vector.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f03957e7c50d..57934ef1d032 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -231,23 +231,16 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
struct irq_desc *desc;
- unsigned long flags;
- int cpu, vector;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- BUG_ON(!data->cfg.vector);
+ int cpu, vector = data->cfg.vector;
- vector = data->cfg.vector;
+ BUG_ON(!vector);
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress)) {
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ if (likely(!data->move_in_progress))
return;
- }
desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
@@ -260,7 +253,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
}
}
data->move_in_progress = 0;
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ cpumask_clear(data->old_domain);
}
void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -282,18 +275,21 @@ static void x86_vector_free_irqs(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data;
+ unsigned long flags;
int i;
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
clear_irq_vector(virq + i, irq_data->chip_data);
free_apic_chip_data(irq_data->chip_data);
+ irq_domain_reset_irq_data(irq_data);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs())
legacy_irq_data[virq + i] = NULL;
#endif
- irq_domain_reset_irq_data(irq_data);
}
}
}
--
1.7.10.4
Joe Lawrence <[email protected]> reported an use after release
issue related to x86 IRQ management code. Please refer to following
link for more information:
https://www.mail-archive.com/[email protected]/msg1026840.html
Thomas pointed out that it's caused by a race condition between
__assign_irq_vector() and __send_cleanup_vector(). Based on Thomas'
draft patch, we solve this race condition by:
1) Use move_in_progress to signal that an IRQ cleanup IPI is needed
2) Use old_domain to save old CPU mask for IRQ cleanup
3) Use vector_lock to protect move_in_progress and old_domain
This bugfix patch also helps to get rid of that atomic allocation in
__send_cleanup_vector().
Signed-off-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/vector.c | 77 ++++++++++++++++++-----------------------
1 file changed, 34 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 57934ef1d032..b63d6f84c0bb 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -117,9 +117,9 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
- unsigned int dest = d->cfg.dest_apicid;
+ unsigned int dest;
- if (d->move_in_progress)
+ if (cpumask_intersects(d->old_domain, cpu_online_mask))
return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
@@ -144,13 +144,12 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
cpumask_and(used_cpumask, d->domain, vector_cpumask);
err = apic->cpu_mask_to_apicid_and(mask, used_cpumask,
&dest);
- if (err)
- break;
- cpumask_andnot(d->old_domain, d->domain,
- vector_cpumask);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_copy(d->domain, used_cpumask);
+ if (!err) {
+ cpumask_andnot(d->old_domain, d->domain,
+ vector_cpumask);
+ cpumask_copy(d->domain, used_cpumask);
+ d->cfg.dest_apicid = dest;
+ }
break;
}
@@ -183,14 +182,12 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (d->cfg.vector) {
+ if (d->cfg.vector)
cpumask_copy(d->old_domain, d->domain);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- }
+ d->cfg.vector = vector;
+ d->cfg.dest_apicid = dest;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
- d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
@@ -198,7 +195,8 @@ next:
if (!err) {
/* cache destination APIC IDs into cfg->dest_apicid */
- d->cfg.dest_apicid = dest;
+ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+ d->move_in_progress = !cpumask_empty(d->old_domain);
}
return err;
@@ -230,7 +228,7 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
int cpu, vector = data->cfg.vector;
BUG_ON(!vector);
@@ -239,10 +237,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress))
- return;
-
- desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
@@ -424,10 +418,13 @@ static void __setup_vector_irq(int cpu)
struct irq_data *idata = irq_desc_get_irq_data(desc);
data = apic_chip_data(idata);
- if (!data || !cpumask_test_cpu(cpu, data->domain))
- continue;
- vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = desc;
+ if (data) {
+ cpumask_clear_cpu(cpu, data->old_domain);
+ if (cpumask_test_cpu(cpu, data->domain)) {
+ vector = data->cfg.vector;
+ per_cpu(vector_irq, cpu)[vector] = desc;
+ }
+ }
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
@@ -509,20 +506,17 @@ static struct irq_chip lapic_controller = {
#ifdef CONFIG_SMP
static void __send_cleanup_vector(struct apic_chip_data *data)
{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
+ unsigned long flags;
- for_each_cpu_and(i, data->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i),
- IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!data->move_in_progress)
+ goto out_unlock;
data->move_in_progress = 0;
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
+ if (!cpumask_empty(data->old_domain))
+ apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+out_unlock:
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
}
void send_cleanup_vector(struct irq_cfg *cfg)
@@ -566,14 +560,10 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
/*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
+ * Nothing to cleanup if this cpu is not set
+ * in the old_domain mask.
*/
- if (data->move_in_progress)
- goto unlock;
-
- if (vector == data->cfg.vector &&
- cpumask_test_cpu(me, data->domain))
+ if (!cpumask_test_cpu(me, data->old_domain))
goto unlock;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -589,6 +579,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
}
--
1.7.10.4
Trivial cleanups for x86 vector allocation code:
1) reorganize apic_chip_data to optimize for size and cache efficiency
2) avoid redundant calling of irq_to_desc()
3) refine code comments
Signed-off-by: Jiang Liu <[email protected]>
---
arch/x86/kernel/apic/vector.c | 54 ++++++++++++++++++-----------------------
1 file changed, 23 insertions(+), 31 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b63d6f84c0bb..0183c44a13cb 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -23,9 +23,9 @@
struct apic_chip_data {
struct irq_cfg cfg;
+ u8 move_in_progress : 1;
cpumask_var_t domain;
cpumask_var_t old_domain;
- u8 move_in_progress : 1;
};
struct irq_domain *x86_vector_domain;
@@ -38,7 +38,7 @@ static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
void lock_vector_lock(void)
{
- /* Used to the online set of cpus does not change
+ /* Used to ensure that the online set of cpus does not change
* during assign_irq_vector.
*/
raw_spin_lock(&vector_lock);
@@ -100,8 +100,7 @@ static void free_apic_chip_data(struct apic_chip_data *data)
}
}
-static int __assign_irq_vector(int irq, struct apic_chip_data *d,
- const struct cpumask *mask)
+static int assign_irq_vector(struct irq_data *data, const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -116,11 +115,15 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
*/
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, err;
+ int cpu, err = -EBUSY;
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct apic_chip_data *d = data->chip_data;
unsigned int dest;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&vector_lock, flags);
if (cpumask_intersects(d->old_domain, cpu_online_mask))
- return -EBUSY;
+ goto out;
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
@@ -187,7 +190,7 @@ next:
d->cfg.vector = vector;
d->cfg.dest_apicid = dest;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
+ per_cpu(vector_irq, new_cpu)[vector] = desc;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
@@ -198,37 +201,27 @@ next:
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
d->move_in_progress = !cpumask_empty(d->old_domain);
}
-
- return err;
-}
-
-static int assign_irq_vector(int irq, struct apic_chip_data *data,
- const struct cpumask *mask)
-{
- int err;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, data, mask);
+out:
raw_spin_unlock_irqrestore(&vector_lock, flags);
+
return err;
}
-static int assign_irq_vector_policy(int irq, int node,
- struct apic_chip_data *data,
+static int assign_irq_vector_policy(struct irq_data *data, int node,
struct irq_alloc_info *info)
{
if (info && info->mask)
- return assign_irq_vector(irq, data, info->mask);
+ return assign_irq_vector(data, info->mask);
if (node != NUMA_NO_NODE &&
- assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ assign_irq_vector(data, cpumask_of_node(node)) == 0)
return 0;
- return assign_irq_vector(irq, data, apic->target_cpus());
+ return assign_irq_vector(data, apic->target_cpus());
}
-static void clear_irq_vector(int irq, struct apic_chip_data *data)
+static void clear_irq_vector(struct irq_data *irq_data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_data_to_desc(irq_data);
+ struct apic_chip_data *data = irq_data->chip_data;
int cpu, vector = data->cfg.vector;
BUG_ON(!vector);
@@ -276,7 +269,7 @@ static void x86_vector_free_irqs(struct irq_domain *domain,
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
raw_spin_lock_irqsave(&vector_lock, flags);
- clear_irq_vector(virq + i, irq_data->chip_data);
+ clear_irq_vector(irq_data);
free_apic_chip_data(irq_data->chip_data);
irq_domain_reset_irq_data(irq_data);
raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -321,7 +314,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, node, data, info);
+ err = assign_irq_vector_policy(irq_data, node, info);
if (err)
goto error;
}
@@ -483,8 +476,7 @@ void apic_ack_edge(struct irq_data *data)
static int apic_set_affinity(struct irq_data *irq_data,
const struct cpumask *dest, bool force)
{
- struct apic_chip_data *data = irq_data->chip_data;
- int err, irq = irq_data->irq;
+ int err;
if (!config_enabled(CONFIG_SMP))
return -EPERM;
@@ -492,7 +484,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, data, dest);
+ err = assign_irq_vector(irq_data, dest);
return err ? err : IRQ_SET_MASK_OK;
}
--
1.7.10.4
On 11/30/2015 03:09 AM, Jiang Liu wrote:
> Joe Lawrence <[email protected]> reported an use after release
> issue related to x86 IRQ management code. Please refer to following
> link for more information:
> https://www.mail-archive.com/[email protected]/msg1026840.html
>
> Thomas pointed out that it's caused by a race condition between
> __assign_irq_vector() and __send_cleanup_vector(). Based on Thomas'
> draft patch, we solve this race condition by:
> 1) Use move_in_progress to signal that an IRQ cleanup IPI is needed
> 2) Use old_domain to save old CPU mask for IRQ cleanup
> 3) Use vector to protect move_in_progress and old_domain
>
> This bugfix patch also helps to get rid of that atomic allocation in
> __send_cleanup_vector().
>
> Signed-off-by: Jiang Liu <[email protected]>
> ---
[ ... snip ... ]
Jiang, Thomas,
Last night I ran with Jiang's five-patch-set on top of 4.3. Tests
started with regular sysfs device removal of mpt HBAs, then later I
added disk stress (the disks are software RAID1 across the HBAs) .. no
issues.
I'll kick off some tougher surprise device removal tests tonight to
further kick the tires.
-- Joe
On 12/01/2015 05:46 PM, Joe Lawrence wrote:
> On 11/30/2015 03:09 AM, Jiang Liu wrote:
>> Joe Lawrence <[email protected]> reported an use after release
>> issue related to x86 IRQ management code. Please refer to following
>> link for more information:
>> https://www.mail-archive.com/[email protected]/msg1026840.html
>>
>> Thomas pointed out that it's caused by a race condition between
>> __assign_irq_vector() and __send_cleanup_vector(). Based on Thomas'
>> draft patch, we solve this race condition by:
>> 1) Use move_in_progress to signal that an IRQ cleanup IPI is needed
>> 2) Use old_domain to save old CPU mask for IRQ cleanup
>> 3) Use vector to protect move_in_progress and old_domain
>>
>> This bugfix patch also helps to get rid of that atomic allocation in
>> __send_cleanup_vector().
>>
>> Signed-off-by: Jiang Liu <[email protected]>
>> ---
>
> [ ... snip ... ]
>
> Jiang, Thomas,
>
> Last night I ran with Jiang's five-patch-set on top of 4.3. Tests
> started with regular sysfs device removal of mpt HBAs, then later I
> added disk stress (the disks are software RAID1 across the HBAs) .. no
> issues.
>
> I'll kick off some tougher surprise device removal tests tonight to
> further kick the tires.
Testing looked good. Feel free to add a Tested-by and/or Reported-by.
Thanks,
-- Joe
On Mon, 7 Dec 2015, Joe Lawrence wrote:
> On 12/01/2015 05:46 PM, Joe Lawrence wrote:
> > On 11/30/2015 03:09 AM, Jiang Liu wrote:
> > > Joe Lawrence <[email protected]> reported an use after release
> > > issue related to x86 IRQ management code. Please refer to following
> > > link for more information:
> > > https://www.mail-archive.com/[email protected]/msg1026840.html
> > >
> > > Thomas pointed out that it's caused by a race condition between
> > > __assign_irq_vector() and __send_cleanup_vector(). Based on Thomas'
> > > draft patch, we solve this race condition by:
> > > 1) Use move_in_progress to signal that an IRQ cleanup IPI is needed
> > > 2) Use old_domain to save old CPU mask for IRQ cleanup
> > > 3) Use vector to protect move_in_progress and old_domain
> > >
> > > This bugfix patch also helps to get rid of that atomic allocation in
> > > __send_cleanup_vector().
> > >
> > > Signed-off-by: Jiang Liu <[email protected]>
> > > ---
> >
> > [ ... snip ... ]
> >
> > Jiang, Thomas,
> >
> > Last night I ran with Jiang's five-patch-set on top of 4.3. Tests
> > started with regular sysfs device removal of mpt HBAs, then later I
> > added disk stress (the disks are software RAID1 across the HBAs) .. no
> > issues.
> >
> > I'll kick off some tougher surprise device removal tests tonight to
> > further kick the tires.
>
> Testing looked good. Feel to add a Tested-by and/or Reported-by.
Ok. Great. I'll pick that lot up and tag it for stable.
Thanks,
tglx
Commit-ID: 6dd7cb991fcbfef55d8bf3d22b8a87f9d5007e20
Gitweb: http://git.kernel.org/tip/6dd7cb991fcbfef55d8bf3d22b8a87f9d5007e20
Author: Jiang Liu <[email protected]>
AuthorDate: Mon, 30 Nov 2015 16:09:26 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Thu, 10 Dec 2015 19:32:07 +0100
x86/irq: Do not reuse struct apic_chip_data.old_domain as temporary buffer
Function __assign_irq_vector() makes use of apic_chip_data.old_domain
as a temporary buffer, which causes trouble to rollback logic in case of
failure. So use a dedicated temporary buffer for __assign_irq_vector().
Fixes: a782a7e46bb5 "x86/irq: Store irq descriptor in vector array"
Reported-and-tested-by: Joe Lawrence <[email protected]>
Signed-off-by: Jiang Liu <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Cc: [email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 861bc59..d6ec36b 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -30,7 +30,7 @@ struct apic_chip_data {
struct irq_domain *x86_vector_domain;
static DEFINE_RAW_SPINLOCK(vector_lock);
-static cpumask_var_t vector_cpumask;
+static cpumask_var_t vector_cpumask, used_cpumask;
static struct irq_chip lapic_controller;
#ifdef CONFIG_X86_IO_APIC
static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
@@ -124,6 +124,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
cpumask_clear(d->old_domain);
+ cpumask_clear(used_cpumask);
cpu = cpumask_first_and(mask, cpu_online_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
@@ -157,9 +158,8 @@ next:
}
if (unlikely(current_vector == vector)) {
- cpumask_or(d->old_domain, d->old_domain,
- vector_cpumask);
- cpumask_andnot(vector_cpumask, mask, d->old_domain);
+ cpumask_or(used_cpumask, used_cpumask, vector_cpumask);
+ cpumask_andnot(vector_cpumask, mask, used_cpumask);
cpu = cpumask_first_and(vector_cpumask,
cpu_online_mask);
continue;
@@ -404,6 +404,7 @@ int __init arch_early_irq_init(void)
arch_init_htirq_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&used_cpumask, GFP_KERNEL));
return arch_early_ioapic_init();
}
Commit-ID: 4c24cee6b2aeaee3dab896f76fef4fe79d9e4183
Gitweb: http://git.kernel.org/tip/4c24cee6b2aeaee3dab896f76fef4fe79d9e4183
Author: Jiang Liu <[email protected]>
AuthorDate: Mon, 30 Nov 2015 16:09:27 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Thu, 10 Dec 2015 19:32:07 +0100
x86/irq: Enhance __assign_irq_vector() to rollback in case of failure
Enhance __assign_irq_vector() to rollback in case of failure so the
caller doesn't need to explicitly rollback.
Fixes: a782a7e46bb5 "x86/irq: Store irq descriptor in vector array"
Reported-and-tested-by: Joe Lawrence <[email protected]>
Signed-off-by: Jiang Liu <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index d6ec36b..f03957e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -117,6 +117,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
+ unsigned int dest = d->cfg.dest_apicid;
if (d->move_in_progress)
return -EBUSY;
@@ -140,11 +141,16 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
* the current in use mask. So cleanup the vector
* allocation for the members that are not used anymore.
*/
+ cpumask_and(used_cpumask, d->domain, vector_cpumask);
+ err = apic->cpu_mask_to_apicid_and(mask, used_cpumask,
+ &dest);
+ if (err)
+ break;
cpumask_andnot(d->old_domain, d->domain,
vector_cpumask);
d->move_in_progress =
cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_and(d->domain, d->domain, vector_cpumask);
+ cpumask_copy(d->domain, used_cpumask);
break;
}
@@ -167,11 +173,13 @@ next:
if (test_bit(vector, used_vectors))
goto next;
-
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
+ if (apic->cpu_mask_to_apicid_and(mask, vector_cpumask, &dest))
+ goto next;
+
/* Found one! */
current_vector = vector;
current_offset = offset;
@@ -190,8 +198,7 @@ next:
if (!err) {
/* cache destination APIC IDs into cfg->dest_apicid */
- err = apic->cpu_mask_to_apicid_and(mask, d->domain,
- &d->cfg.dest_apicid);
+ d->cfg.dest_apicid = dest;
}
return err;
@@ -493,14 +500,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
return -EINVAL;
err = assign_irq_vector(irq, data, dest);
- if (err) {
- if (assign_irq_vector(irq, data,
- irq_data_get_affinity_mask(irq_data)))
- pr_err("Failed to recover vector for irq %d\n", irq);
- return err;
- }
- return IRQ_SET_MASK_OK;
+ return err ? err : IRQ_SET_MASK_OK;
}
static struct irq_chip lapic_controller = {
Commit-ID: 21a1b3bf35018b446c943c15f0a6225e6f6497ae
Gitweb: http://git.kernel.org/tip/21a1b3bf35018b446c943c15f0a6225e6f6497ae
Author: Jiang Liu <[email protected]>
AuthorDate: Mon, 30 Nov 2015 16:09:28 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Thu, 10 Dec 2015 19:32:07 +0100
x86/irq: Fix a race window in x86_vector_free_irqs()
There's a race condition between x86_vector_free_irqs()
{
free_apic_chip_data(irq_data->chip_data);
xxxxx //irq_data->chip_data has been freed, but the pointer
//hasn't been reset yet
irq_domain_reset_irq_data(irq_data);
}
and smp_irq_move_cleanup_interrupt()
{
raw_spin_lock(&vector_lock);
data = apic_chip_data(irq_desc_get_irq_data(desc));
access data->xxxx // may access freed memory
raw_spin_unlock(&desc->lock);
}
, which may cause smp_irq_move_cleanup_interrupt() accesses freed memory.
So use vector_lock to guard all memory free code in x86_vector_free_irqs().
Fixes: a782a7e46bb5 "x86/irq: Store irq descriptor in vector array"
Reported-and-tested-by: Joe Lawrence <[email protected]>
Signed-off-by: Jiang Liu <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f03957e..57934ef 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -231,23 +231,16 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
struct irq_desc *desc;
- unsigned long flags;
- int cpu, vector;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- BUG_ON(!data->cfg.vector);
+ int cpu, vector = data->cfg.vector;
- vector = data->cfg.vector;
+ BUG_ON(!vector);
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress)) {
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ if (likely(!data->move_in_progress))
return;
- }
desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
@@ -260,7 +253,7 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
}
}
data->move_in_progress = 0;
- raw_spin_unlock_irqrestore(&vector_lock, flags);
+ cpumask_clear(data->old_domain);
}
void init_irq_alloc_info(struct irq_alloc_info *info,
@@ -282,18 +275,21 @@ static void x86_vector_free_irqs(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data;
+ unsigned long flags;
int i;
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
clear_irq_vector(virq + i, irq_data->chip_data);
free_apic_chip_data(irq_data->chip_data);
+ irq_domain_reset_irq_data(irq_data);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs())
legacy_irq_data[virq + i] = NULL;
#endif
- irq_domain_reset_irq_data(irq_data);
}
}
}
Commit-ID: 41c7518a5d14543fa4aa1b5b9994ac26b38c0406
Gitweb: http://git.kernel.org/tip/41c7518a5d14543fa4aa1b5b9994ac26b38c0406
Author: Jiang Liu <[email protected]>
AuthorDate: Mon, 30 Nov 2015 16:09:29 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Thu, 10 Dec 2015 19:32:07 +0100
x86/irq: Fix a race condition between vector assigning and cleanup
Joe Lawrence reported a use-after-release issue related to x86 IRQ
management code. Please refer to the following link for more
information: http://lkml.kernel.org/r/[email protected]
Thomas pointed out that it's caused by a race condition between
__assign_irq_vector() and __send_cleanup_vector(). Based on Thomas'
draft patch, we solve this race condition by:
1) Use move_in_progress to signal that an IRQ cleanup IPI is needed
2) Use old_domain to save old CPU mask for IRQ cleanup
3) Use vector_lock to protect move_in_progress and old_domain
This bugfix patch also helps to get rid of that atomic allocation in
__send_cleanup_vector().
Fixes: a782a7e46bb5 "x86/irq: Store irq descriptor in vector array"
Reported-and-tested-by: Joe Lawrence <[email protected]>
Signed-off-by: Jiang Liu <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 77 +++++++++++++++++++------------------------
1 file changed, 34 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 57934ef..b63d6f8 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -117,9 +117,9 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
- unsigned int dest = d->cfg.dest_apicid;
+ unsigned int dest;
- if (d->move_in_progress)
+ if (cpumask_intersects(d->old_domain, cpu_online_mask))
return -EBUSY;
/* Only try and allocate irqs on cpus that are present */
@@ -144,13 +144,12 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
cpumask_and(used_cpumask, d->domain, vector_cpumask);
err = apic->cpu_mask_to_apicid_and(mask, used_cpumask,
&dest);
- if (err)
- break;
- cpumask_andnot(d->old_domain, d->domain,
- vector_cpumask);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- cpumask_copy(d->domain, used_cpumask);
+ if (!err) {
+ cpumask_andnot(d->old_domain, d->domain,
+ vector_cpumask);
+ cpumask_copy(d->domain, used_cpumask);
+ d->cfg.dest_apicid = dest;
+ }
break;
}
@@ -183,14 +182,12 @@ next:
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (d->cfg.vector) {
+ if (d->cfg.vector)
cpumask_copy(d->old_domain, d->domain);
- d->move_in_progress =
- cpumask_intersects(d->old_domain, cpu_online_mask);
- }
+ d->cfg.vector = vector;
+ d->cfg.dest_apicid = dest;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
- d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
@@ -198,7 +195,8 @@ next:
if (!err) {
/* cache destination APIC IDs into cfg->dest_apicid */
- d->cfg.dest_apicid = dest;
+ cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
+ d->move_in_progress = !cpumask_empty(d->old_domain);
}
return err;
@@ -230,7 +228,7 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
- struct irq_desc *desc;
+ struct irq_desc *desc = irq_to_desc(irq);
int cpu, vector = data->cfg.vector;
BUG_ON(!vector);
@@ -239,10 +237,6 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
data->cfg.vector = 0;
cpumask_clear(data->domain);
- if (likely(!data->move_in_progress))
- return;
-
- desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
@@ -424,10 +418,13 @@ static void __setup_vector_irq(int cpu)
struct irq_data *idata = irq_desc_get_irq_data(desc);
data = apic_chip_data(idata);
- if (!data || !cpumask_test_cpu(cpu, data->domain))
- continue;
- vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = desc;
+ if (data) {
+ cpumask_clear_cpu(cpu, data->old_domain);
+ if (cpumask_test_cpu(cpu, data->domain)) {
+ vector = data->cfg.vector;
+ per_cpu(vector_irq, cpu)[vector] = desc;
+ }
+ }
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
@@ -509,20 +506,17 @@ static struct irq_chip lapic_controller = {
#ifdef CONFIG_SMP
static void __send_cleanup_vector(struct apic_chip_data *data)
{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
+ unsigned long flags;
- for_each_cpu_and(i, data->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i),
- IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!data->move_in_progress)
+ goto out_unlock;
data->move_in_progress = 0;
+ cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
+ if (!cpumask_empty(data->old_domain))
+ apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR);
+out_unlock:
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
}
void send_cleanup_vector(struct irq_cfg *cfg)
@@ -566,14 +560,10 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
/*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
+ * Nothing to cleanup if this cpu is not set
+ * in the old_domain mask.
*/
- if (data->move_in_progress)
- goto unlock;
-
- if (vector == data->cfg.vector &&
- cpumask_test_cpu(me, data->domain))
+ if (!cpumask_test_cpu(me, data->old_domain))
goto unlock;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -589,6 +579,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
goto unlock;
}
__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ cpumask_clear_cpu(me, data->old_domain);
unlock:
raw_spin_unlock(&desc->lock);
}
Commit-ID: 27dd9e6098141a9ebaafe48d50277fcae6e09775
Gitweb: http://git.kernel.org/tip/27dd9e6098141a9ebaafe48d50277fcae6e09775
Author: Jiang Liu <[email protected]>
AuthorDate: Mon, 30 Nov 2015 16:09:30 +0800
Committer: Thomas Gleixner <[email protected]>
CommitDate: Thu, 10 Dec 2015 19:39:57 +0100
x86/irq: Trivial cleanups for x86 vector allocation code
Trivial cleanups for x86 vector allocation code:
1) reorganize apic_chip_data to optimize for size and cache efficiency
2) avoid redundant calling of irq_to_desc()
3) refine code comments
Signed-off-by: Jiang Liu <[email protected]>
Cc: Joe Lawrence <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/vector.c | 54 ++++++++++++++++++-------------------------
1 file changed, 23 insertions(+), 31 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b63d6f8..0183c44 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -23,9 +23,9 @@
struct apic_chip_data {
struct irq_cfg cfg;
+ u8 move_in_progress : 1;
cpumask_var_t domain;
cpumask_var_t old_domain;
- u8 move_in_progress : 1;
};
struct irq_domain *x86_vector_domain;
@@ -38,7 +38,7 @@ static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
void lock_vector_lock(void)
{
- /* Used to the online set of cpus does not change
+ /* Used to ensure that the online set of cpus does not change
* during assign_irq_vector.
*/
raw_spin_lock(&vector_lock);
@@ -100,8 +100,7 @@ static void free_apic_chip_data(struct apic_chip_data *data)
}
}
-static int __assign_irq_vector(int irq, struct apic_chip_data *d,
- const struct cpumask *mask)
+static int assign_irq_vector(struct irq_data *data, const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
@@ -116,11 +115,15 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d,
*/
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, err;
+ int cpu, err = -EBUSY;
+ struct irq_desc *desc = irq_data_to_desc(data);
+ struct apic_chip_data *d = data->chip_data;
unsigned int dest;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&vector_lock, flags);
if (cpumask_intersects(d->old_domain, cpu_online_mask))
- return -EBUSY;
+ goto out;
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
@@ -187,7 +190,7 @@ next:
d->cfg.vector = vector;
d->cfg.dest_apicid = dest;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
+ per_cpu(vector_irq, new_cpu)[vector] = desc;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
break;
@@ -198,37 +201,27 @@ next:
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
d->move_in_progress = !cpumask_empty(d->old_domain);
}
-
- return err;
-}
-
-static int assign_irq_vector(int irq, struct apic_chip_data *data,
- const struct cpumask *mask)
-{
- int err;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, data, mask);
+out:
raw_spin_unlock_irqrestore(&vector_lock, flags);
+
return err;
}
-static int assign_irq_vector_policy(int irq, int node,
- struct apic_chip_data *data,
+static int assign_irq_vector_policy(struct irq_data *data, int node,
struct irq_alloc_info *info)
{
if (info && info->mask)
- return assign_irq_vector(irq, data, info->mask);
+ return assign_irq_vector(data, info->mask);
if (node != NUMA_NO_NODE &&
- assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+ assign_irq_vector(data, cpumask_of_node(node)) == 0)
return 0;
- return assign_irq_vector(irq, data, apic->target_cpus());
+ return assign_irq_vector(data, apic->target_cpus());
}
-static void clear_irq_vector(int irq, struct apic_chip_data *data)
+static void clear_irq_vector(struct irq_data *irq_data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_data_to_desc(irq_data);
+ struct apic_chip_data *data = irq_data->chip_data;
int cpu, vector = data->cfg.vector;
BUG_ON(!vector);
@@ -276,7 +269,7 @@ static void x86_vector_free_irqs(struct irq_domain *domain,
irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
if (irq_data && irq_data->chip_data) {
raw_spin_lock_irqsave(&vector_lock, flags);
- clear_irq_vector(virq + i, irq_data->chip_data);
+ clear_irq_vector(irq_data);
free_apic_chip_data(irq_data->chip_data);
irq_domain_reset_irq_data(irq_data);
raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -321,7 +314,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq + i, node, data, info);
+ err = assign_irq_vector_policy(irq_data, node, info);
if (err)
goto error;
}
@@ -483,8 +476,7 @@ void apic_ack_edge(struct irq_data *data)
static int apic_set_affinity(struct irq_data *irq_data,
const struct cpumask *dest, bool force)
{
- struct apic_chip_data *data = irq_data->chip_data;
- int err, irq = irq_data->irq;
+ int err;
if (!config_enabled(CONFIG_SMP))
return -EPERM;
@@ -492,7 +484,7 @@ static int apic_set_affinity(struct irq_data *irq_data,
if (!cpumask_intersects(dest, cpu_online_mask))
return -EINVAL;
- err = assign_irq_vector(irq, data, dest);
+ err = assign_irq_vector(irq_data, dest);
return err ? err : IRQ_SET_MASK_OK;
}