2018-11-24 08:42:31

by Jürgen Groß

Subject: [PATCH] xen/x86: add diagnostic printout to xen_mc_flush() in case of error

Failure of an element of a Xen multicall is signalled via a WARN()
only unless the kernel is compiled with MC_DEBUG. It is impossible to
know which element failed and why it did so.

Change that by printing the related information even without MC_DEBUG,
even if maybe in some limited form (e.g. without information which
caller produced the failing element).

Move the printing out of the switch statement in order to have the
same information for a single call.

Signed-off-by: Juergen Gross <[email protected]>
---
arch/x86/xen/multicalls.c | 35 ++++++++++++++++++++---------------
1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 2bce7958ce8b..0766a08bdf45 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -69,6 +69,11 @@ void xen_mc_flush(void)

         trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);

+#if MC_DEBUG
+        memcpy(b->debug, b->entries,
+               b->mcidx * sizeof(struct multicall_entry));
+#endif
+
         switch (b->mcidx) {
         case 0:
                 /* no-op */
@@ -87,32 +92,34 @@ void xen_mc_flush(void)
                 break;

         default:
-#if MC_DEBUG
-                memcpy(b->debug, b->entries,
-                       b->mcidx * sizeof(struct multicall_entry));
-#endif
-
                 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
                         BUG();
                 for (i = 0; i < b->mcidx; i++)
                         if (b->entries[i].result < 0)
                                 ret++;
+        }

+        if (WARN_ON(ret)) {
+                pr_err("%d of %d multicall(s) failed: cpu %d\n",
+                       ret, b->mcidx, smp_processor_id());
+                for (i = 0; i < b->mcidx; i++) {
+                        if (b->entries[i].result < 0) {
 #if MC_DEBUG
-                if (ret) {
-                        printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
-                               ret, smp_processor_id());
-                        dump_stack();
-                        for (i = 0; i < b->mcidx; i++) {
-                                printk(KERN_DEBUG "  call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
-                                       i+1, b->mcidx,
+                                pr_err("  call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
+                                       i + 1,
                                        b->debug[i].op,
                                        b->debug[i].args[0],
                                        b->entries[i].result,
                                        b->caller[i]);
+#else
+                                pr_err("  call %2d: op=%lu arg=[%lx] result=%ld\n",
+                                       i + 1,
+                                       b->entries[i].op,
+                                       b->entries[i].args[0],
+                                       b->entries[i].result);
+#endif
                         }
                 }
-#endif
         }

         b->mcidx = 0;
@@ -126,8 +133,6 @@ void xen_mc_flush(void)
         b->cbidx = 0;

         local_irq_restore(flags);
-
-        WARN_ON(ret);
 }

 struct multicall_space __xen_mc_entry(size_t args)
--
2.16.4
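
To illustrate the effect: on a kernel built without MC_DEBUG, a batch with
failing elements would now log something along these lines (values invented
for illustration; only the format strings come from the patch, and the usual
WARN_ON() stack dump is printed first):

    3 of 5 multicall(s) failed: cpu 0
      call  2: op=1 arg=[ffff88001234b000] result=-22
      call  4: op=1 arg=[ffff88001234d000] result=-22

With MC_DEBUG the per-call lines additionally show the op/args saved in
b->debug[] before the hypercall and the %pF-formatted caller recorded in
b->caller[].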



2018-11-26 20:12:51

by Boris Ostrovsky

Subject: Re: [PATCH] xen/x86: add diagnostic printout to xen_mc_flush() in case of error

On 11/23/18 11:24 AM, Juergen Gross wrote:
> Failure of an element of a Xen multicall is signalled via a WARN()
> only unless the kernel is compiled with MC_DEBUG. It is impossible to

s/unless/if


> know which element failed and why it did so.
>
> Change that by printing the related information even without MC_DEBUG,
> even if maybe in some limited form (e.g. without information which
> caller produced the failing element).
>
> Move the printing out of the switch statement in order to have the
> same information for a single call.
>
> Signed-off-by: Juergen Gross <[email protected]>
> ---
> arch/x86/xen/multicalls.c | 35 ++++++++++++++++++++---------------
> 1 file changed, 20 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
> index 2bce7958ce8b..0766a08bdf45 100644
> --- a/arch/x86/xen/multicalls.c
> +++ b/arch/x86/xen/multicalls.c
> @@ -69,6 +69,11 @@ void xen_mc_flush(void)
>
> trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
>
> +#if MC_DEBUG
> + memcpy(b->debug, b->entries,
> + b->mcidx * sizeof(struct multicall_entry));
> +#endif
> +
> switch (b->mcidx) {
> case 0:
> /* no-op */
> @@ -87,32 +92,34 @@ void xen_mc_flush(void)
> break;
>
> default:
> -#if MC_DEBUG
> - memcpy(b->debug, b->entries,
> - b->mcidx * sizeof(struct multicall_entry));
> -#endif
> -
> if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
> BUG();
> for (i = 0; i < b->mcidx; i++)
> if (b->entries[i].result < 0)
> ret++;
> + }
>
> + if (WARN_ON(ret)) {
> + pr_err("%d of %d multicall(s) failed: cpu %d\n",
> + ret, b->mcidx, smp_processor_id());
> + for (i = 0; i < b->mcidx; i++) {
> + if (b->entries[i].result < 0) {
> #if MC_DEBUG
> - if (ret) {
> - printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
> - ret, smp_processor_id());
> - dump_stack();
> - for (i = 0; i < b->mcidx; i++) {
> - printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
> - i+1, b->mcidx,
> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
> + i + 1,
> b->debug[i].op,
> b->debug[i].args[0],
> b->entries[i].result,
> b->caller[i]);
> +#else
> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
> + i + 1,
> + b->entries[i].op,
> + b->entries[i].args[0],
> + b->entries[i].result);
> +#endif

Doesn't (non-debug) hypervisor corrupt op and args?

(Also, we don't really need to print anything when b->entries[i].result
== 0)


-boris


> }
> }
> -#endif
> }
>
> b->mcidx = 0;
> @@ -126,8 +133,6 @@ void xen_mc_flush(void)
> b->cbidx = 0;
>
> local_irq_restore(flags);
> -
> - WARN_ON(ret);
> }
>
> struct multicall_space __xen_mc_entry(size_t args)


2018-11-27 07:49:01

by Jürgen Groß

Subject: Re: [PATCH] xen/x86: add diagnostic printout to xen_mc_flush() in case of error

On 26/11/2018 21:11, Boris Ostrovsky wrote:
> On 11/23/18 11:24 AM, Juergen Gross wrote:
>> Failure of an element of a Xen multicall is signalled via a WARN()
>> only unless the kernel is compiled with MC_DEBUG. It is impossible to
>
> s/unless/if
>
>
>> know which element failed and why it did so.
>>
>> Change that by printing the related information even without MC_DEBUG,
>> even if maybe in some limited form (e.g. without information which
>> caller produced the failing element).
>>
>> Move the printing out of the switch statement in order to have the
>> same information for a single call.
>>
>> Signed-off-by: Juergen Gross <[email protected]>
>> ---
>> arch/x86/xen/multicalls.c | 35 ++++++++++++++++++++---------------
>> 1 file changed, 20 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
>> index 2bce7958ce8b..0766a08bdf45 100644
>> --- a/arch/x86/xen/multicalls.c
>> +++ b/arch/x86/xen/multicalls.c
>> @@ -69,6 +69,11 @@ void xen_mc_flush(void)
>>
>> trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
>>
>> +#if MC_DEBUG
>> + memcpy(b->debug, b->entries,
>> + b->mcidx * sizeof(struct multicall_entry));
>> +#endif
>> +
>> switch (b->mcidx) {
>> case 0:
>> /* no-op */
>> @@ -87,32 +92,34 @@ void xen_mc_flush(void)
>> break;
>>
>> default:
>> -#if MC_DEBUG
>> - memcpy(b->debug, b->entries,
>> - b->mcidx * sizeof(struct multicall_entry));
>> -#endif
>> -
>> if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
>> BUG();
>> for (i = 0; i < b->mcidx; i++)
>> if (b->entries[i].result < 0)
>> ret++;
>> + }
>>
>> + if (WARN_ON(ret)) {
>> + pr_err("%d of %d multicall(s) failed: cpu %d\n",
>> + ret, b->mcidx, smp_processor_id());
>> + for (i = 0; i < b->mcidx; i++) {
>> + if (b->entries[i].result < 0) {
>> #if MC_DEBUG
>> - if (ret) {
>> - printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
>> - ret, smp_processor_id());
>> - dump_stack();
>> - for (i = 0; i < b->mcidx; i++) {
>> - printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
>> - i+1, b->mcidx,
>> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
>> + i + 1,
>> b->debug[i].op,
>> b->debug[i].args[0],
>> b->entries[i].result,
>> b->caller[i]);
>> +#else
>> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
>> + i + 1,
>> + b->entries[i].op,
>> + b->entries[i].args[0],
>> + b->entries[i].result);
>> +#endif
>
> Doesn't (non-debug) hypervisor corrupt op and args?

No. Only debug hypervisor does so.

See my patch (and rather lengthy discussion) on xen-devel:

https://lists.xen.org/archives/html/xen-devel/2018-11/msg02714.html
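
Roughly, what the debug hypervisor does today is the following (paraphrased
from memory of xen/common/multicall.c; see the link above for the exact code
and for the change proposed there):

    /*
     * Debug (non-NDEBUG) builds only: deliberately clobber each element
     * after processing it, so guests learn to rely on nothing but the
     * result field.  Sketch, not a verbatim quote.
     */
    struct multicall_entry corrupt;

    memset(&corrupt, 0xAA, sizeof(corrupt));
    (void)__copy_to_guest(call_list, &corrupt, 1);

A release (NDEBUG) hypervisor leaves op and args untouched, which is why the
non-MC_DEBUG branch in the patch can print them as-is.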

>
> (Also, we don't really need to print anything when b->entries[i].result
> == 0)

Right. Did you miss the:

+ if (b->entries[i].result < 0) {

above?


Juergen

2018-11-27 14:29:22

by Boris Ostrovsky

Subject: Re: [PATCH] xen/x86: add diagnostic printout to xen_mc_flush() in case of error

On 11/27/18 2:46 AM, Juergen Gross wrote:
> On 26/11/2018 21:11, Boris Ostrovsky wrote:
>> On 11/23/18 11:24 AM, Juergen Gross wrote:
>>> Failure of an element of a Xen multicall is signalled via a WARN()
>>> only unless the kernel is compiled with MC_DEBUG. It is impossible to
>> s/unless/if
>>
>>
>>> know which element failed and why it did so.
>>>
>>> Change that by printing the related information even without MC_DEBUG,
>>> even if maybe in some limited form (e.g. without information which
>>> caller produced the failing element).
>>>
>>> Move the printing out of the switch statement in order to have the
>>> same information for a single call.
>>>
>>> Signed-off-by: Juergen Gross <[email protected]>
>>> ---
>>> arch/x86/xen/multicalls.c | 35 ++++++++++++++++++++---------------
>>> 1 file changed, 20 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
>>> index 2bce7958ce8b..0766a08bdf45 100644
>>> --- a/arch/x86/xen/multicalls.c
>>> +++ b/arch/x86/xen/multicalls.c
>>> @@ -69,6 +69,11 @@ void xen_mc_flush(void)
>>>
>>> trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
>>>
>>> +#if MC_DEBUG
>>> + memcpy(b->debug, b->entries,
>>> + b->mcidx * sizeof(struct multicall_entry));
>>> +#endif
>>> +
>>> switch (b->mcidx) {
>>> case 0:
>>> /* no-op */
>>> @@ -87,32 +92,34 @@ void xen_mc_flush(void)
>>> break;
>>>
>>> default:
>>> -#if MC_DEBUG
>>> - memcpy(b->debug, b->entries,
>>> - b->mcidx * sizeof(struct multicall_entry));
>>> -#endif
>>> -
>>> if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
>>> BUG();
>>> for (i = 0; i < b->mcidx; i++)
>>> if (b->entries[i].result < 0)
>>> ret++;
>>> + }
>>>
>>> + if (WARN_ON(ret)) {
>>> + pr_err("%d of %d multicall(s) failed: cpu %d\n",
>>> + ret, b->mcidx, smp_processor_id());
>>> + for (i = 0; i < b->mcidx; i++) {
>>> + if (b->entries[i].result < 0) {
>>> #if MC_DEBUG
>>> - if (ret) {
>>> - printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
>>> - ret, smp_processor_id());
>>> - dump_stack();
>>> - for (i = 0; i < b->mcidx; i++) {
>>> - printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
>>> - i+1, b->mcidx,
>>> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
>>> + i + 1,
>>> b->debug[i].op,
>>> b->debug[i].args[0],
>>> b->entries[i].result,
>>> b->caller[i]);
>>> +#else
>>> + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
>>> + i + 1,
>>> + b->entries[i].op,
>>> + b->entries[i].args[0],
>>> + b->entries[i].result);
>>> +#endif
>> Doesn't (non-debug) hypervisor corrupt op and args?
> No. Only debug hypervisor does so.
>
> See my patch (and rather lengthy discussion) on xen-devel:
>
> https://lists.xen.org/archives/html/xen-devel/2018-11/msg02714.html

Yes, I saw that later, after I responded to the patch.


>
>> (Also, we don't really need to print anything when b->entries[i].result
>> == 0)
> Right. Did you miss the:

I did.

Reviewed-by: Boris Ostrovsky <[email protected]>

(with commit message fixed)



>
> + if (b->entries[i].result < 0) {
>
> above?
>
>
> Juergen
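
For reference, with the hunks above applied the error-reporting tail of
xen_mc_flush() reads as follows (reconstructed from the diff, with one
clarifying comment added; the rest of the function is elided):

        if (WARN_ON(ret)) {
                pr_err("%d of %d multicall(s) failed: cpu %d\n",
                       ret, b->mcidx, smp_processor_id());
                for (i = 0; i < b->mcidx; i++) {
                        if (b->entries[i].result < 0) {
#if MC_DEBUG
                                /* op/args were copied to b->debug[] before the
                                 * hypercall, so they survive a debug Xen. */
                                pr_err("  call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
                                       i + 1,
                                       b->debug[i].op,
                                       b->debug[i].args[0],
                                       b->entries[i].result,
                                       b->caller[i]);
#else
                                pr_err("  call %2d: op=%lu arg=[%lx] result=%ld\n",
                                       i + 1,
                                       b->entries[i].op,
                                       b->entries[i].args[0],
                                       b->entries[i].result);
#endif
                        }
                }
        }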