Date: Wed, 25 Nov 2015 22:12:20 +0100 (CET)
From: Thomas Gleixner
To: Joe Lawrence
cc: LKML, Jiang Liu, x86@kernel.org
Subject: Re: irq_desc use-after-free in smp_irq_move_cleanup_interrupt

On Wed, 25 Nov 2015, Thomas Gleixner wrote:
> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
> does not get another IPI before the next move ..... That has been that
> way forever.
>
> Duh. Working on a real fix this time.

Here you go. Completely untested of course. Larger than I hoped for,
but the simple fix of just clearing the move_in_progress flag before
sending the IPI does not work because:

CPU0                        CPU1                     CPU2

data->move_in_progress=0
sendIPI()
                            set_affinity()
                            lock_vector()            handle_IPI
                            move_in_progress = 1     lock_vector()
                            unlock_vector()
                                                     move_in_progress == 1
                                                     -> no cleanup

So we are back to square one. Now one might think that taking vector
lock prevents that issue:

CPU0                        CPU1                     CPU2

lock_vector()
data->move_in_progress=0
sendIPI()
unlock_vector()
                            set_affinity()
                            assign_irq_vector()
                            lock_vector()            handle_IPI
                            move_in_progress = 1     lock_vector()
                            unlock_vector()
                                                     move_in_progress == 1

Not really.
So now the solution is:

CPU0                        CPU1                     CPU2

lock_vector()
data->move_in_progress=0
data->cleanup_mask = data->old_domain
sendIPI()
unlock_vector()
                            set_affinity()
                            assign_irq_vector()
                            lock_vector()
                            if (move_in_progress ||
                                !empty(cleanup_mask)) {
                                    unlock_vector()
                                    return -EBUSY;
                            }                        handle_IPI
                                                     lock_vector()
                                                     move_in_progress == 0
                                                     cpu is set in cleanup mask
                                                     -> cleanup vector

Looks a bit overkill with the extra cpumask. I tried a simple counter
but that does not work versus cpu unplug as we do not know whether the
outgoing cpu is involved in the cleanup or not. And if the cpu is
involved we starve assign_irq_vector() ....

The upside of this is that we get rid of that atomic allocation in
__send_cleanup_vector().

Brain hurts by now.

Not-Yet-Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/apic/vector.c |   37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -25,6 +25,7 @@ struct apic_chip_data {
 	struct irq_cfg		cfg;
 	cpumask_var_t		domain;
 	cpumask_var_t		old_domain;
+	cpumask_var_t		cleanup_mask;
 	u8			move_in_progress : 1;
 };
 
@@ -83,7 +84,11 @@ static struct apic_chip_data *alloc_apic
 		goto out_data;
 	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
 		goto out_domain;
+	if (!zalloc_cpumask_var_node(&data->cleanup_mask, GFP_KERNEL, node))
+		goto out_old;
 	return data;
+out_old:
+	free_cpumask_var(data->old_domain);
 out_domain:
 	free_cpumask_var(data->domain);
 out_data:
@@ -96,6 +101,7 @@ static void free_apic_chip_data(struct a
 	if (data) {
 		free_cpumask_var(data->domain);
 		free_cpumask_var(data->old_domain);
+		free_cpumask_var(data->cleanup_mask);
 		kfree(data);
 	}
 }
@@ -118,7 +124,7 @@ static int __assign_irq_vector(int irq,
 	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
 
-	if (d->move_in_progress)
+	if (d->move_in_progress || !cpumask_empty(d->cleanup_mask))
 		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
@@ -511,20 +517,11 @@ static struct irq_chip lapic_controller
 #ifdef CONFIG_SMP
 static void __send_cleanup_vector(struct apic_chip_data *data)
 {
-	cpumask_var_t cleanup_mask;
-
-	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-		unsigned int i;
-
-		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
-			apic->send_IPI_mask(cpumask_of(i),
-					    IRQ_MOVE_CLEANUP_VECTOR);
-	} else {
-		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
-		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-		free_cpumask_var(cleanup_mask);
-	}
+	raw_spin_lock(&vector_lock);
+	cpumask_and(data->cleanup_mask, data->old_domain, cpu_online_mask);
 	data->move_in_progress = 0;
+	apic->send_IPI_mask(data->cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+	raw_spin_unlock(&vector_lock);
 }
 
 void send_cleanup_vector(struct irq_cfg *cfg)
@@ -568,14 +565,11 @@ asmlinkage __visible void smp_irq_move_c
 			goto unlock;
 
 		/*
-		 * Check if the irq migration is in progress. If so, we
-		 * haven't received the cleanup request yet for this irq.
+		 * Nothing to cleanup if irq migration is in progress
+		 * or this cpu is not set in the cleanup mask.
 		 */
-		if (data->move_in_progress)
-			goto unlock;
-
-		if (vector == data->cfg.vector &&
-		    cpumask_test_cpu(me, data->domain))
+		if (data->move_in_progress ||
+		    !cpumask_test_cpu(me, data->cleanup_mask))
 			goto unlock;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -591,6 +585,7 @@ asmlinkage __visible void smp_irq_move_c
 			goto unlock;
 		}
 		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+		cpumask_clear_cpu(me, data->cleanup_mask);
 unlock:
 		raw_spin_unlock(&desc->lock);
 	}