Subject: Re: irq_desc use-after-free in smp_irq_move_cleanup_interrupt
To: Thomas Gleixner, Joe Lawrence
Cc: LKML, x86@kernel.org
From: Jiang Liu
Organization: Intel
Message-ID: <56580F12.8070607@linux.intel.com>
Date: Fri, 27 Nov 2015 16:06:42 +0800
References: <5653B688.4050809@stratus.com>

On 2015/11/26 5:12, Thomas Gleixner wrote:
> On Wed, 25 Nov 2015, Thomas Gleixner wrote:
>> So if CPU1 gets the IPI _BEFORE_ move_in_progress is set to 0, and
>> does not get another IPI before the next move ..... That has been that
>> way forever.
>>
>> Duh. Working on a real fix this time.
>
> Here you go. Completely untested of course.
>
> Larger than I hoped for, but the simple fix of just clearing the
> move_in_progress flag before sending the IPI does not work because:
>
> CPU0                        CPU1                    CPU2
> data->move_in_progress=0
> sendIPI()
>                             set_affinity()
>                             lock_vector()           handle_IPI
>                             move_in_progress = 1    lock_vector()
>                             unlock_vector()
>                                                     move_in_progress == 1
>                                                     -> no cleanup
>
> So we are back to square one.
> Now one might think that taking vector lock prevents that issue:
>
> CPU0                        CPU1                    CPU2
> lock_vector()
> data->move_in_progress=0
> sendIPI()
> unlock_vector()
>                             set_affinity()
>                             assign_irq_vector()
>                             lock_vector()           handle_IPI
>                             move_in_progress = 1    lock_vector()
>                             unlock_vector()
>                                                     move_in_progress == 1
>
> Not really.
>
> So now the solution is:
>
> CPU0                        CPU1                    CPU2
> lock_vector()
> data->move_in_progress=0
> data->cleanup_mask = data->old_domain
> sendIPI()
> unlock_vector()
>                             set_affinity()
>                             assign_irq_vector()
>                             lock_vector()
>                             if (move_in_progress ||
>                                 !empty(cleanup_mask)) {
>                                unlock_vector()
>                                return -EBUSY;       handle_IPI
>                             }                       lock_vector()
>                                                     move_in_progress == 0
>                                                     cpu is set in cleanup mask
>                                                     -> cleanup vector
>
> Looks a bit overkill with the extra cpumask. I tried a simple counter
> but that does not work versus cpu unplug as we do not know whether the
> outgoing cpu is involved in the cleanup or not. And if the cpu is
> involved we starve assign_irq_vector() ....
>
> The upside of this is that we get rid of that atomic allocation in
> __send_cleanup_vector().

Hi Thomas,
	Maybe more headache for you now :) It seems there is still room
for improvement. First, it seems we could just reuse old_domain instead
of adding cleanup_mask. Second, I found another race window among
x86_vector_free_irqs(), __send_cleanup_vector() and
smp_irq_move_cleanup_interrupt(). I'm trying to refine your patch based
on the following rules:
1) move_in_progress controls whether we need to send IPIs;
2) old_domain controls which CPUs we should do cleanup on;
3) assign_irq_vector() checks both move_in_progress and old_domain.
Will send out the patch soon for comments :)
Thanks,
Gerry

>
> Brain hurts by now.
>
> Not-Yet-Signed-off-by: Thomas Gleixner
> ---
>  arch/x86/kernel/apic/vector.c |   37 ++++++++++++++++---------------------
>  1 file changed, 16 insertions(+), 21 deletions(-)
>
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -25,6 +25,7 @@ struct apic_chip_data {
>  	struct irq_cfg		cfg;
>  	cpumask_var_t		domain;
>  	cpumask_var_t		old_domain;
> +	cpumask_var_t		cleanup_mask;
>  	u8			move_in_progress : 1;
>  };
>
> @@ -83,7 +84,11 @@ static struct apic_chip_data *alloc_apic
>  		goto out_data;
>  	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
>  		goto out_domain;
> +	if (!zalloc_cpumask_var_node(&data->cleanup_mask, GFP_KERNEL, node))
> +		goto out_old;
>  	return data;
> +out_old:
> +	free_cpumask_var(data->old_domain);
>  out_domain:
>  	free_cpumask_var(data->domain);
>  out_data:
> @@ -96,6 +101,7 @@ static void free_apic_chip_data(struct a
>  	if (data) {
>  		free_cpumask_var(data->domain);
>  		free_cpumask_var(data->old_domain);
> +		free_cpumask_var(data->cleanup_mask);
>  		kfree(data);
>  	}
>  }
> @@ -118,7 +124,7 @@ static int __assign_irq_vector(int irq,
>  	static int current_offset = VECTOR_OFFSET_START % 16;
>  	int cpu, err;
>
> -	if (d->move_in_progress)
> +	if (d->move_in_progress || !cpumask_empty(d->cleanup_mask))
>  		return -EBUSY;
>
>  	/* Only try and allocate irqs on cpus that are present */
> @@ -511,20 +517,11 @@ static struct irq_chip lapic_controller
>  #ifdef CONFIG_SMP
>  static void __send_cleanup_vector(struct apic_chip_data *data)
>  {
> -	cpumask_var_t cleanup_mask;
> -
> -	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
> -		unsigned int i;
> -
> -		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
> -			apic->send_IPI_mask(cpumask_of(i),
> -					    IRQ_MOVE_CLEANUP_VECTOR);
> -	} else {
> -		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
> -		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> -		free_cpumask_var(cleanup_mask);
> -	}
> +	raw_spin_lock(&vector_lock);
> +	cpumask_and(data->cleanup_mask, data->old_domain, cpu_online_mask);
>  	data->move_in_progress = 0;
> +	apic->send_IPI_mask(data->cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> +	raw_spin_unlock(&vector_lock);
>  }
>
>  void send_cleanup_vector(struct irq_cfg *cfg)
> @@ -568,14 +565,11 @@ asmlinkage __visible void smp_irq_move_c
>  		goto unlock;
>
>  	/*
> -	 * Check if the irq migration is in progress. If so, we
> -	 * haven't received the cleanup request yet for this irq.
> +	 * Nothing to cleanup if irq migration is in progress
> +	 * or this cpu is not set in the cleanup mask.
>  	 */
> -	if (data->move_in_progress)
> -		goto unlock;
> -
> -	if (vector == data->cfg.vector &&
> -	    cpumask_test_cpu(me, data->domain))
> +	if (data->move_in_progress ||
> +	    !cpumask_test_cpu(me, data->cleanup_mask))
>  		goto unlock;
>
>  	irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
> @@ -591,6 +585,7 @@ asmlinkage __visible void smp_irq_move_c
>  		goto unlock;
>  	}
>  	__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
> +	cpumask_clear_cpu(me, data->cleanup_mask);
>  unlock:
>  	raw_spin_unlock(&desc->lock);
>  }