Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
Subject: Re: [Openipmi-developer] [PATCH] x86: Fix MCE error handing when
 kdump is enabled
To:     <minyard@acm.org>
CC:     Corey Minyard <cminyard@mvista.com>, <arnd@arndb.de>,
        <gregkh@linuxfoundation.org>, <linux-kernel@vger.kernel.org>,
        <linfeilong@huawei.com>, <hidehiro.kawai.ez@hitachi.com>,
        <openipmi-developer@lists.sourceforge.net>,
        <liuzhiqiang26@huawei.com>
References: <20200922161311.GQ3674@minyard.net>
 <20200922182940.31843-1-minyard@acm.org> <20200922184332.GT3674@minyard.net>
From:   Wu Bo <wubo40@huawei.com>
Message-ID: <29448f27-12f7-82a1-7483-80471c36d48c@huawei.com>
Date:   Wed, 23 Sep 2020 16:48:31 +0800
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101
 Thunderbird/78.2.2
MIME-Version: 1.0
In-Reply-To: <20200922184332.GT3674@minyard.net>
Content-Type: text/plain; charset="utf-8"; format=flowed
Content-Language: en-US
Content-Transfer-Encoding: 7bit
Precedence: bulk

On 2020/9/23 2:43, Corey Minyard wrote:
> On Tue, Sep 22, 2020 at 01:29:40PM -0500, minyard@acm.org wrote:
>> From: Corey Minyard <cminyard@mvista.com>
>>
>> If kdump is enabled, the handling of shooting down CPUs does not use the
>> RESET_VECTOR irq before trying to use NMIs to shoot down the CPUs.
>>
>> For normal errors that is fine.  MCEs, however, are already running in
>> an NMI, so sending them an NMI won't do anything.  The MCE code is set
>> up to receive the RESET_VECTOR because it disables CPUs, but it won't
>                                              ^ should be "enables irqs"
>> work on the NMI-only case.
>>
>> There is already code in place to scan for the NMI callback being ready,
>> simply call that from the MCE's wait_for_panic() code so it will pick up
>> and handle it if an NMI shootdown is requested.  This required
>> propagating the registers down to wait_for_panic().
>>
>> Signed-off-by: Corey Minyard <cminyard@mvista.com>
>> ---
>> After looking at it a bit, I think this is the proper way to fix the
>> issue, though I'm not an expert on this code so I'm not sure.
>>
>> I have not even tested this patch, I have only compiled it.  But from
>> what I can tell, things waiting in NMIs for a shootdown should call
>> run_crash_ipi_callback() in their wait loop.

Hi,

In my VM (using qemu-kvm) with kdump enabled, I used mce-inject to inject
an uncorrectable error. I hit an issue where the IPMI driver's panic
handling runs while the other CPUs are sitting in "wait_for_panic()"
with interrupts on, and IPMI interrupts interfere with the panic
handling. As a result, the IPMI panic handling hangs for more than
3000 seconds.

After applying and testing this patch, the IPMI hang no longer occurs.
It should be a solution to the problem.


Thanks,

Wu Bo

>>
>>   arch/x86/kernel/cpu/mce/core.c | 67 ++++++++++++++++++++++------------
>>   1 file changed, 44 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index f43a78bde670..3a842b3773b3 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -282,20 +282,35 @@ static int fake_panic;
>>   static atomic_t mce_fake_panicked;
>>   
>>   /* Panic in progress. Enable interrupts and wait for final IPI */
>> -static void wait_for_panic(void)
>> +static void wait_for_panic(struct pt_regs *regs)
>>   {
>>   	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
>>   
>>   	preempt_disable();
>>   	local_irq_enable();
>> -	while (timeout-- > 0)
>> +	while (timeout-- > 0) {
>> +		/*
>> +		 * We are in an NMI waiting to be stopped by the
>> +		 * handing processor.  For kdump handling, we need to
>> +		 * be monitoring crash_ipi_issued since that is what
>> +		 * is used for an NMI stop used by kdump.  But we also
>> +		 * need to have interrupts enabled some so that
>> +		 * RESET_VECTOR will interrupt us on a normal
>> +		 * shutdown.
>> +		 */
>> +		local_irq_disable();
>> +		run_crash_ipi_callback(regs);
>> +		local_irq_enable();
>> +
>>   		udelay(1);
>> +	}
>>   	if (panic_timeout == 0)
>>   		panic_timeout = mca_cfg.panic_timeout;
>>   	panic("Panicing machine check CPU died");
>>   }
>>   
>> -static void mce_panic(const char *msg, struct mce *final, char *exp)
>> +static void mce_panic(const char *msg, struct mce *final, char *exp,
>> +		      struct pt_regs *regs)
>>   {
>>   	int apei_err = 0;
>>   	struct llist_node *pending;
>> @@ -306,7 +321,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
>>   		 * Make sure only one CPU runs in machine check panic
>>   		 */
>>   		if (atomic_inc_return(&mce_panicked) > 1)
>> -			wait_for_panic();
>> +			wait_for_panic(regs);
>>   		barrier();
>>   
>>   		bust_spinlocks(1);
>> @@ -817,7 +832,7 @@ static atomic_t mce_callin;
>>   /*
>>    * Check if a timeout waiting for other CPUs happened.
>>    */
>> -static int mce_timed_out(u64 *t, const char *msg)
>> +static int mce_timed_out(u64 *t, const char *msg, struct pt_regs *regs)
>>   {
>>   	/*
>>   	 * The others already did panic for some reason.
>> @@ -827,12 +842,12 @@ static int mce_timed_out(u64 *t, const char *msg)
>>   	 */
>>   	rmb();
>>   	if (atomic_read(&mce_panicked))
>> -		wait_for_panic();
>> +		wait_for_panic(regs);
>>   	if (!mca_cfg.monarch_timeout)
>>   		goto out;
>>   	if ((s64)*t < SPINUNIT) {
>>   		if (mca_cfg.tolerant <= 1)
>> -			mce_panic(msg, NULL, NULL);
>> +			mce_panic(msg, NULL, NULL, regs);
>>   		cpu_missing = 1;
>>   		return 1;
>>   	}
>> @@ -866,7 +881,7 @@ static int mce_timed_out(u64 *t, const char *msg)
>>    * All the spin loops have timeouts; when a timeout happens a CPU
>>    * typically elects itself to be Monarch.
>>    */
>> -static void mce_reign(void)
>> +static void mce_reign(struct pt_regs *regs)
>>   {
>>   	int cpu;
>>   	struct mce *m = NULL;
>> @@ -896,7 +911,7 @@ static void mce_reign(void)
>>   	 * other CPUs.
>>   	 */
>>   	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
>> -		mce_panic("Fatal machine check", m, msg);
>> +		mce_panic("Fatal machine check", m, msg, regs);
>>   
>>   	/*
>>   	 * For UC somewhere we let the CPU who detects it handle it.
>> @@ -909,7 +924,8 @@ static void mce_reign(void)
>>   	 * source or one CPU is hung. Panic.
>>   	 */
>>   	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
>> -		mce_panic("Fatal machine check from unknown source", NULL, NULL);
>> +		mce_panic("Fatal machine check from unknown source", NULL, NULL,
>> +			  regs);
>>   
>>   	/*
>>   	 * Now clear all the mces_seen so that they don't reappear on
>> @@ -928,7 +944,7 @@ static atomic_t global_nwo;
>>    * in the entry order.
>>    * TBD double check parallel CPU hotunplug
>>    */
>> -static int mce_start(int *no_way_out)
>> +static int mce_start(int *no_way_out, struct pt_regs *regs)
>>   {
>>   	int order;
>>   	int cpus = num_online_cpus();
>> @@ -949,7 +965,8 @@ static int mce_start(int *no_way_out)
>>   	 */
>>   	while (atomic_read(&mce_callin) != cpus) {
>>   		if (mce_timed_out(&timeout,
>> -				  "Timeout: Not all CPUs entered broadcast exception handler")) {
>> +				  "Timeout: Not all CPUs entered broadcast exception handler",
>> +				  regs)) {
>>   			atomic_set(&global_nwo, 0);
>>   			return -1;
>>   		}
>> @@ -975,7 +992,8 @@ static int mce_start(int *no_way_out)
>>   		 */
>>   		while (atomic_read(&mce_executing) < order) {
>>   			if (mce_timed_out(&timeout,
>> -					  "Timeout: Subject CPUs unable to finish machine check processing")) {
>> +					  "Timeout: Subject CPUs unable to finish machine check processing",
>> +					  regs)) {
>>   				atomic_set(&global_nwo, 0);
>>   				return -1;
>>   			}
>> @@ -995,7 +1013,7 @@ static int mce_start(int *no_way_out)
>>    * Synchronize between CPUs after main scanning loop.
>>    * This invokes the bulk of the Monarch processing.
>>    */
>> -static int mce_end(int order)
>> +static int mce_end(int order, struct pt_regs *regs)
>>   {
>>   	int ret = -1;
>>   	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
>> @@ -1020,12 +1038,13 @@ static int mce_end(int order)
>>   		 */
>>   		while (atomic_read(&mce_executing) <= cpus) {
>>   			if (mce_timed_out(&timeout,
>> -					  "Timeout: Monarch CPU unable to finish machine check processing"))
>> +					  "Timeout: Monarch CPU unable to finish machine check processing",
>> +					  regs))
>>   				goto reset;
>>   			ndelay(SPINUNIT);
>>   		}
>>   
>> -		mce_reign();
>> +		mce_reign(regs);
>>   		barrier();
>>   		ret = 0;
>>   	} else {
>> @@ -1034,7 +1053,8 @@ static int mce_end(int order)
>>   		 */
>>   		while (atomic_read(&mce_executing) != 0) {
>>   			if (mce_timed_out(&timeout,
>> -					  "Timeout: Monarch CPU did not finish machine check processing"))
>> +					  "Timeout: Monarch CPU did not finish machine check processing",
>> +					  regs))
>>   				goto reset;
>>   			ndelay(SPINUNIT);
>>   		}
>> @@ -1286,9 +1306,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>   	 */
>>   	if (lmce) {
>>   		if (no_way_out)
>> -			mce_panic("Fatal local machine check", &m, msg);
>> +			mce_panic("Fatal local machine check", &m, msg, regs);
>>   	} else {
>> -		order = mce_start(&no_way_out);
>> +		order = mce_start(&no_way_out, regs);
>>   	}
>>   
>>   	__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
>> @@ -1301,7 +1321,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>   	 * When there's any problem use only local no_way_out state.
>>   	 */
>>   	if (!lmce) {
>> -		if (mce_end(order) < 0)
>> +		if (mce_end(order, regs) < 0)
>>   			no_way_out = worst >= MCE_PANIC_SEVERITY;
>>   	} else {
>>   		/*
>> @@ -1314,7 +1334,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>   		 */
>>   		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
>>   			mce_severity(&m, cfg->tolerant, &msg, true);
>> -			mce_panic("Local fatal machine check!", &m, msg);
>> +			mce_panic("Local fatal machine check!", &m, msg, regs);
>>   		}
>>   	}
>>   
>> @@ -1325,7 +1345,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>   	if (cfg->tolerant == 3)
>>   		kill_it = 0;
>>   	else if (no_way_out)
>> -		mce_panic("Fatal machine check on current CPU", &m, msg);
>> +		mce_panic("Fatal machine check on current CPU", &m, msg, regs);
>>   
>>   	if (worst > 0)
>>   		irq_work_queue(&mce_irq_work);
>> @@ -1361,7 +1381,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
>>   		 */
>>   		if (m.kflags & MCE_IN_KERNEL_RECOV) {
>>   			if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
>> -				mce_panic("Failed kernel mode recovery", &m, msg);
>> +				mce_panic("Failed kernel mode recovery", &m,
>> +					  msg, regs);
>>   		}
>>   	}
>>   }
>> -- 
>> 2.17.1
>>
>>
>>
>> _______________________________________________
>> Openipmi-developer mailing list
>> Openipmi-developer@lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/openipmi-developer
> .
>