Hello,
This is a small update of the previously introduced vcpu stall detector
which adds an interrupt to the virtual device to notify the guest VM in
case it stalls. This lets the guest VM handle the reboot and panic in
case the stall detector expires.
Thanks,
Sebastian Ene (2):
dt-bindings: vcpu_stall_detector: Add a PPI interrupt to the virtual
device
misc: Register a PPI for the vcpu stall detection virtual device
.../misc/qemu,vcpu-stall-detector.yaml | 6 +++
drivers/misc/vcpu_stall_detector.c | 41 ++++++++++++++++++-
2 files changed, 45 insertions(+), 2 deletions(-)
--
2.45.1.288.g0e0cd299f1-goog
Request a PPI for each vCPU during probe which will be used by the host
to communicate a stall detected event on the vCPU. When the host raises
this interrupt from the virtual machine monitor, the guest is expected to
handle the interrupt and panic.
Signed-off-by: Sebastian Ene <[email protected]>
---
drivers/misc/vcpu_stall_detector.c | 41 ++++++++++++++++++++++++++++--
1 file changed, 39 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c
index e2015c87f03f..c580cd7fd225 100644
--- a/drivers/misc/vcpu_stall_detector.c
+++ b/drivers/misc/vcpu_stall_detector.c
@@ -32,6 +32,7 @@
struct vcpu_stall_detect_config {
u32 clock_freq_hz;
u32 stall_timeout_sec;
+ int ppi_irq;
void __iomem *membase;
struct platform_device *dev;
@@ -77,6 +78,12 @@ vcpu_stall_detect_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
+static irqreturn_t vcpu_stall_detector_irq(int irq, void *dev)
+{
+ panic("vCPU stall detector");
+ return IRQ_HANDLED;
+}
+
static int start_stall_detector_cpu(unsigned int cpu)
{
u32 ticks, ping_timeout_ms;
@@ -132,7 +139,7 @@ static int stop_stall_detector_cpu(unsigned int cpu)
static int vcpu_stall_detect_probe(struct platform_device *pdev)
{
- int ret;
+ int ret, irq, num_irqs;
struct resource *r;
void __iomem *membase;
u32 clock_freq_hz = VCPU_STALL_DEFAULT_CLOCK_HZ;
@@ -169,9 +176,32 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
vcpu_stall_config = (struct vcpu_stall_detect_config) {
.membase = membase,
.clock_freq_hz = clock_freq_hz,
- .stall_timeout_sec = stall_timeout_sec
+ .stall_timeout_sec = stall_timeout_sec,
+ .ppi_irq = -1,
};
+ num_irqs = platform_irq_count(pdev);
+ if (num_irqs < 0) {
+ dev_err(&pdev->dev, "Failed to get irqs\n");
+ ret = num_irqs;
+ goto err;
+ } else if (num_irqs > 1) {
+ dev_err(&pdev->dev, "Multipple irqs detected\n");
+ ret = -EINVAL;
+ goto err;
+ } else if (num_irqs == 1) {
+ irq = platform_get_irq(pdev, 0);
+ if ((irq > 0) && irq_is_percpu_devid(irq)) {
+ ret = request_percpu_irq(irq,
+ vcpu_stall_detector_irq,
+ "vcpu_stall_detector",
+ vcpu_stall_detectors);
+ if (!ret)
+ vcpu_stall_config.ppi_irq = irq;
+
+ }
+ }
+
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"virt/vcpu_stall_detector:online",
start_stall_detector_cpu,
@@ -184,6 +214,9 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
vcpu_stall_config.hp_online = ret;
return 0;
err:
+ if (vcpu_stall_config.ppi_irq > 0)
+ free_percpu_irq(vcpu_stall_config.ppi_irq,
+ vcpu_stall_detectors);
return ret;
}
@@ -193,6 +226,10 @@ static void vcpu_stall_detect_remove(struct platform_device *pdev)
cpuhp_remove_state(vcpu_stall_config.hp_online);
+ if (vcpu_stall_config.ppi_irq > 0)
+ free_percpu_irq(vcpu_stall_config.ppi_irq,
+ vcpu_stall_detectors);
+
for_each_possible_cpu(cpu)
stop_stall_detector_cpu(cpu);
}
--
2.45.1.288.g0e0cd299f1-goog
On Thu, May 23, 2024 at 04:04:13PM +0000, Sebastian Ene wrote:
> Request a PPI for each vCPU during probe which will be used by the host
> to communicate a stall detected event on the vCPU. When the host raises
> this interrupt from the virtual machine monitor, the guest is expected to
> handle the interrupt and panic.
>
> Signed-off-by: Sebastian Ene <[email protected]>
> ---
> drivers/misc/vcpu_stall_detector.c | 41 ++++++++++++++++++++++++++++--
> 1 file changed, 39 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c
> index e2015c87f03f..c580cd7fd225 100644
> --- a/drivers/misc/vcpu_stall_detector.c
> +++ b/drivers/misc/vcpu_stall_detector.c
> @@ -32,6 +32,7 @@
> struct vcpu_stall_detect_config {
> u32 clock_freq_hz;
> u32 stall_timeout_sec;
> + int ppi_irq;
>
> void __iomem *membase;
> struct platform_device *dev;
> @@ -77,6 +78,12 @@ vcpu_stall_detect_timer_fn(struct hrtimer *hrtimer)
> return HRTIMER_RESTART;
> }
>
> +static irqreturn_t vcpu_stall_detector_irq(int irq, void *dev)
> +{
> + panic("vCPU stall detector");
> + return IRQ_HANDLED;
> +}
> +
> static int start_stall_detector_cpu(unsigned int cpu)
> {
> u32 ticks, ping_timeout_ms;
> @@ -132,7 +139,7 @@ static int stop_stall_detector_cpu(unsigned int cpu)
>
> static int vcpu_stall_detect_probe(struct platform_device *pdev)
> {
> - int ret;
> + int ret, irq, num_irqs;
> struct resource *r;
> void __iomem *membase;
> u32 clock_freq_hz = VCPU_STALL_DEFAULT_CLOCK_HZ;
> @@ -169,9 +176,32 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
> vcpu_stall_config = (struct vcpu_stall_detect_config) {
> .membase = membase,
> .clock_freq_hz = clock_freq_hz,
> - .stall_timeout_sec = stall_timeout_sec
> + .stall_timeout_sec = stall_timeout_sec,
> + .ppi_irq = -1,
> };
>
> + num_irqs = platform_irq_count(pdev);
> + if (num_irqs < 0) {
> + dev_err(&pdev->dev, "Failed to get irqs\n");
platform_irq_count() returns either the number of IRQs or -EPROBE_DEFER.
I don't think emitting an error on deferred probe is the correct thing
to do here.
> + ret = num_irqs;
> + goto err;
> + } else if (num_irqs > 1) {
> + dev_err(&pdev->dev, "Multipple irqs detected\n");
Typo: "Multipple" should be "Multiple". I don't really see why you're going
to this level of complexity though; why aren't you just doing a single
platform_get_irq_optional() call?
> + ret = -EINVAL;
> + goto err;
> + } else if (num_irqs == 1) {
> + irq = platform_get_irq(pdev, 0);
> + if ((irq > 0) && irq_is_percpu_devid(irq)) {
> + ret = request_percpu_irq(irq,
> + vcpu_stall_detector_irq,
> + "vcpu_stall_detector",
> + vcpu_stall_detectors);
> + if (!ret)
> + vcpu_stall_config.ppi_irq = irq;
> +
> + }
> + }
> +
> ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> "virt/vcpu_stall_detector:online",
> start_stall_detector_cpu,
> @@ -184,6 +214,9 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
> vcpu_stall_config.hp_online = ret;
> return 0;
> err:
> + if (vcpu_stall_config.ppi_irq > 0)
> + free_percpu_irq(vcpu_stall_config.ppi_irq,
> + vcpu_stall_detectors);
> return ret;
> }
>
> @@ -193,6 +226,10 @@ static void vcpu_stall_detect_remove(struct platform_device *pdev)
>
> cpuhp_remove_state(vcpu_stall_config.hp_online);
>
> + if (vcpu_stall_config.ppi_irq > 0)
> + free_percpu_irq(vcpu_stall_config.ppi_irq,
> + vcpu_stall_detectors);
> +
> for_each_possible_cpu(cpu)
> stop_stall_detector_cpu(cpu);
> }
> --
> 2.45.1.288.g0e0cd299f1-goog
>
>
On Fri, May 24, 2024 at 08:00:42PM +0100, Conor Dooley wrote:
> On Thu, May 23, 2024 at 04:04:13PM +0000, Sebastian Ene wrote:
> > Request a PPI for each vCPU during probe which will be used by the host
> > to communicate a stall detected event on the vCPU. When the host raises
> > this interrupt from the virtual machine monitor, the guest is expected to
> > handle the interrupt and panic.
> >
> > Signed-off-by: Sebastian Ene <[email protected]>
> > ---
> > drivers/misc/vcpu_stall_detector.c | 41 ++++++++++++++++++++++++++++--
> > 1 file changed, 39 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c
> > index e2015c87f03f..c580cd7fd225 100644
> > --- a/drivers/misc/vcpu_stall_detector.c
> > +++ b/drivers/misc/vcpu_stall_detector.c
> > @@ -32,6 +32,7 @@
> > struct vcpu_stall_detect_config {
> > u32 clock_freq_hz;
> > u32 stall_timeout_sec;
> > + int ppi_irq;
> >
> > void __iomem *membase;
> > struct platform_device *dev;
> > @@ -77,6 +78,12 @@ vcpu_stall_detect_timer_fn(struct hrtimer *hrtimer)
> > return HRTIMER_RESTART;
> > }
> >
> > +static irqreturn_t vcpu_stall_detector_irq(int irq, void *dev)
> > +{
> > + panic("vCPU stall detector");
> > + return IRQ_HANDLED;
> > +}
> > +
> > static int start_stall_detector_cpu(unsigned int cpu)
> > {
> > u32 ticks, ping_timeout_ms;
> > @@ -132,7 +139,7 @@ static int stop_stall_detector_cpu(unsigned int cpu)
> >
> > static int vcpu_stall_detect_probe(struct platform_device *pdev)
> > {
> > - int ret;
> > + int ret, irq, num_irqs;
> > struct resource *r;
> > void __iomem *membase;
> > u32 clock_freq_hz = VCPU_STALL_DEFAULT_CLOCK_HZ;
> > @@ -169,9 +176,32 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
> > vcpu_stall_config = (struct vcpu_stall_detect_config) {
> > .membase = membase,
> > .clock_freq_hz = clock_freq_hz,
> > - .stall_timeout_sec = stall_timeout_sec
> > + .stall_timeout_sec = stall_timeout_sec,
> > + .ppi_irq = -1,
> > };
> >
> > + num_irqs = platform_irq_count(pdev);
> > + if (num_irqs < 0) {
> > + dev_err(&pdev->dev, "Failed to get irqs\n");
Hello Conor,
>
> platform_irq_count() either returns a number or EPROBE_DEFER, I don't
> think emitting an error on deferred probe is the correct thing to do
> here?
I will drop this.
> > + ret = num_irqs;
> > + goto err;
> > + } else if (num_irqs > 1) {
> > + dev_err(&pdev->dev, "Multipple irqs detected\n");
>
> Typo. I don't really see why you're going to this level of complexity
> though, why aren't you just doing a single get_irq_optional()?
>
Thanks for the feedback. I simplified it by using
platform_get_irq_optional() as you suggested.
> > + ret = -EINVAL;
> > + goto err;
> > + } else if (num_irqs == 1) {
> > + irq = platform_get_irq(pdev, 0);
> > + if ((irq > 0) && irq_is_percpu_devid(irq)) {
> > + ret = request_percpu_irq(irq,
> > + vcpu_stall_detector_irq,
> > + "vcpu_stall_detector",
> > + vcpu_stall_detectors);
> > + if (!ret)
> > + vcpu_stall_config.ppi_irq = irq;
> > +
> > + }
> > + }
> > +
> > ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > "virt/vcpu_stall_detector:online",
> > start_stall_detector_cpu,
> > @@ -184,6 +214,9 @@ static int vcpu_stall_detect_probe(struct platform_device *pdev)
> > vcpu_stall_config.hp_online = ret;
> > return 0;
> > err:
> > + if (vcpu_stall_config.ppi_irq > 0)
> > + free_percpu_irq(vcpu_stall_config.ppi_irq,
> > + vcpu_stall_detectors);
> > return ret;
> > }
> >
> > @@ -193,6 +226,10 @@ static void vcpu_stall_detect_remove(struct platform_device *pdev)
> >
> > cpuhp_remove_state(vcpu_stall_config.hp_online);
> >
> > + if (vcpu_stall_config.ppi_irq > 0)
> > + free_percpu_irq(vcpu_stall_config.ppi_irq,
> > + vcpu_stall_detectors);
> > +
> > for_each_possible_cpu(cpu)
> > stop_stall_detector_cpu(cpu);
> > }
> > --
> > 2.45.1.288.g0e0cd299f1-goog
> >
> >
Cheers,
Seb