Date: Wed, 23 May 2012 12:11:33 +0200
From: Borislav Petkov
To: Chen Gong
Cc: tony.luck@intel.com, bp@amd64.org, x86@kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] x86: auto poll/interrupt mode switch for CMC to stop CMC storm
Message-ID: <20120523101133.GA13637@aftab.osrc.amd.com>
In-Reply-To: <1337740341-26711-1-git-send-email-gong.chen@linux.intel.com>

On Wed, May 23, 2012 at 10:32:21AM +0800, Chen Gong wrote:
> This idea is inspired by the IA64 implementation; it is like
> NAPI for the network stack. When there are too many CMCIs to
> handle, the interrupt is disabled and poll mode takes over
> handling the events. When no more events occur in the system,
> the CMC interrupt is re-enabled automatically.
>
> Signed-off-by: Chen Gong
> ---
>  arch/x86/kernel/cpu/mcheck/mce.c |   83 +++++++++++++++++++++++++++++++++++++-
>  1 file changed, 81 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index d086a09..6334f0d 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -92,6 +92,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
>  
>  static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>  
> +static DEFINE_PER_CPU(struct timer_list, mce_timer);
>  static DEFINE_PER_CPU(struct mce, mces_seen);
>  static int cpu_missing;
>  
> @@ -100,8 +101,28 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
>  };
>  
> +#define CMC_POLL_INTERVAL (1 * 30)
> +#define CMC_STORM 5
> +static DEFINE_PER_CPU(int, cmci_storm_warning);
> +static DEFINE_PER_CPU(unsigned long, first_cmci_jiffie);
> +static DEFINE_SPINLOCK(cmc_poll_lock);
> +
> +/*
> + * This variable tells whether we are in cmci-storm-happened mode.
> + * Start with this in the wrong state so we won't play w/ timers
> + * before the system is ready.
> + */
> +static int cmci_storm_detected = 1;
> +
>  static DEFINE_PER_CPU(struct work_struct, mce_work);
>  
> +static void mce_disable_cmci(void *data);
> +static void mce_enable_ce(void *all);
> +static void cmc_disable_keventd(struct work_struct *dummy);
> +static void cmc_enable_keventd(struct work_struct *dummy);
> +
> +static DECLARE_WORK(cmc_disable_work, cmc_disable_keventd);
> +static DECLARE_WORK(cmc_enable_work, cmc_enable_keventd);
>  /*
>   * CPU/chipset specific EDAC code can register a notifier call here to print
>   * MCE errors in a human-readable form.
> @@ -582,6 +603,37 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
>  {
>  	struct mce m;
>  	int i;
> +	unsigned long flag;
> +
> +	spin_lock_irqsave(&cmc_poll_lock, flag);
> +	if (cmci_storm_detected == 0) {
> +		unsigned long now = jiffies;
> +		int *count = &__get_cpu_var(cmci_storm_warning);
> +		unsigned long *history = &__get_cpu_var(first_cmci_jiffie);
> +
> +		if (time_before_eq(now, *history + HZ))
> +			(*count)++;
> +		else {
> +			*count = 0;
> +			*history = now;
> +		}
> +
> +		if (*count >= CMC_STORM) {
> +			cmci_storm_detected = 1;
> +			/* If we're being hit with CMC interrupts, we won't
> +			 * ever execute the schedule_work() below. Need to
> +			 * disable CMC interrupts on this processor now.
> +			 */
> +			mce_disable_cmci(NULL);
> +			if (!work_pending(&cmc_disable_work))
> +				schedule_work(&cmc_disable_work);
> +			spin_unlock_irqrestore(&cmc_poll_lock, flag);
> +			printk(KERN_WARNING "WARNING: Switching to polling "\
> +				"CMC handler; error records may be lost\n");
> +			goto out;
> +		}
> +	}
> +	spin_unlock_irqrestore(&cmc_poll_lock, flag);
>  
>  	percpu_inc(mce_poll_count);
>  
> @@ -628,6 +680,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
>  		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
>  	}
>  
> +out:
>  	/*
>  	 * Don't clear MCG_STATUS here because it's only defined for
>  	 * exceptions.
> @@ -1199,6 +1252,20 @@ static void mce_process_work(struct work_struct *dummy)
>  		memory_failure(pfn, MCE_VECTOR, 0);
>  }
>  
> +static void cmc_disable_keventd(struct work_struct *dummy)
> +{
> +	struct timer_list *t = __this_cpu_ptr(&mce_timer);
> +
> +	on_each_cpu(mce_disable_cmci, NULL, 0);
> +	mod_timer(t, jiffies + CMC_POLL_INTERVAL * HZ);
> +}
> +
> +static void cmc_enable_keventd(struct work_struct *dummy)
> +{
> +	/* don't re-initiate timer */
> +	on_each_cpu(mce_enable_ce, NULL, 0);
> +}
> +
>  #ifdef CONFIG_X86_MCE_INTEL
>  /***
>   * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
> @@ -1232,12 +1299,12 @@ void mce_log_therm_throt_event(__u64 status)
>  static int check_interval = 5 * 60; /* 5 minutes */
>  
>  static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
> -static DEFINE_PER_CPU(struct timer_list, mce_timer);
>  
>  static void mce_start_timer(unsigned long data)
>  {
>  	struct timer_list *t = &per_cpu(mce_timer, data);
>  	int *n;
> +	unsigned long flags;
>  
>  	WARN_ON(smp_processor_id() != data);
>  
> @@ -1253,8 +1320,19 @@ static void mce_start_timer(unsigned long data)
>  	n = &__get_cpu_var(mce_next_interval);
>  	if (mce_notify_irq())
>  		*n = max(*n/2, HZ/100);
> -	else
> +	else {
>  		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
> +		/* if no CMC event, switch out of polling mode */
> +		spin_lock_irqsave(&cmc_poll_lock, flags);
> +		if (cmci_storm_detected == 1) {
> +			printk(KERN_WARNING "Returning to interrupt driven "\
> +				"CMC handler\n");
> +			if (!work_pending(&cmc_enable_work))
> +				schedule_work(&cmc_enable_work);
> +			cmci_storm_detected = 0;
> +		}
> +		spin_unlock_irqrestore(&cmc_poll_lock, flags);
> +	}
>  
>  	t->expires = jiffies + *n;
>  	add_timer_on(t, smp_processor_id());
> @@ -1547,6 +1625,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
>  	__mcheck_cpu_init_generic();
>  	__mcheck_cpu_init_vendor(c);
>  	__mcheck_cpu_init_timer();
> +	cmci_storm_detected = 0;
>  	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
>  	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);

This whole CMCI thing is Intel-specific so why don't you add it to
where the rest of the CMCI stuff is? You need only a hook in
machine_check_poll() which runs on Intel only and does the whole
temporary CMCI toggling.
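A rough, completely untested sketch of what such a hook could look
like in mce_intel.c is below. The names cmci_storm_check(),
cmci_storm_end() and CMCI_STORM_THRESHOLD are made up purely for
illustration; only cmci_clear() and cmci_reenable() are existing
helpers in that file:

#define CMCI_STORM_THRESHOLD	5	/* CMCIs within one second */

static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
static DEFINE_PER_CPU(bool, cmci_storm_on);

/*
 * Called from machine_check_poll() on Intel only. Returns true when a
 * CMCI storm is detected; CMCI is then disabled on this CPU and the
 * caller falls back to polling until the storm subsides.
 */
bool cmci_storm_check(void)
{
	unsigned long now = jiffies;

	if (time_before_eq(now, __this_cpu_read(cmci_time_stamp) + HZ)) {
		__this_cpu_inc(cmci_storm_cnt);
	} else {
		/* more than a second since the last CMCI: new window */
		__this_cpu_write(cmci_time_stamp, now);
		__this_cpu_write(cmci_storm_cnt, 0);
	}

	if (__this_cpu_read(cmci_storm_cnt) < CMCI_STORM_THRESHOLD)
		return false;

	cmci_clear();			/* disable CMCI on this CPU ... */
	__this_cpu_write(cmci_storm_on, true);
	return true;			/* ... caller switches to polling */
}

/* Called from the polling timer once no more CMC events are seen. */
void cmci_storm_end(void)
{
	if (!__this_cpu_read(cmci_storm_on))
		return;

	__this_cpu_write(cmci_storm_on, false);
	__this_cpu_write(cmci_storm_cnt, 0);
	cmci_reenable();		/* back to interrupt mode */
}

That way the whole storm-detection state stays in the Intel-specific
file and the generic mce.c needs none of the spinlock and workqueue
churn above.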
Thanks.

--
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551