The size of the buffer being bounced is not checked against the size
of the originally mapped buffer. Because the size can be controlled
by a device, as is the case with virtio devices, this can lead to
memory corruption.
This patch records the remaining mapped size for each slab and uses
that information to validate the size in the sync/unmap paths before
swiotlb_bounce is called.
Validating this argument is important under the threat models of
AMD SEV-SNP and Intel TDX, where the HV is considered untrusted.
Signed-off-by: Martin Radev <[email protected]>
---
kernel/dma/swiotlb.c | 52 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..98d79103aa1f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -102,6 +102,11 @@ static unsigned int max_segment;
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
static phys_addr_t *io_tlb_orig_addr;
+/*
+ * The remaining mapped size for each slab, used to validate sizes in the sync/unmap paths.
+ */
+static size_t *io_tlb_orig_size;
+
/*
* Protect the above data structures in the map and unmap calls
*/
@@ -240,9 +245,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t));
+ io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_orig_size)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
for (i = 0; i < io_tlb_nslabs; i++) {
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -363,7 +375,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
* between io_tlb_start and io_tlb_end.
*/
io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
- get_order(io_tlb_nslabs * sizeof(int)));
+ get_order(io_tlb_nslabs * sizeof(int)));
if (!io_tlb_list)
goto cleanup3;
@@ -374,9 +386,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!io_tlb_orig_addr)
goto cleanup4;
+ io_tlb_orig_size = (size_t *)
+ __get_free_pages(GFP_KERNEL,
+ get_order(io_tlb_nslabs *
+ sizeof(size_t)));
+ if (!io_tlb_orig_size)
+ goto cleanup5;
+
+
for (i = 0; i < io_tlb_nslabs; i++) {
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
io_tlb_index = 0;
no_iotlb_memory = false;
@@ -389,6 +410,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
return 0;
+cleanup5:
+ free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+ sizeof(phys_addr_t)));
+
cleanup4:
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
sizeof(int)));
@@ -404,6 +429,8 @@ void __init swiotlb_exit(void)
return;
if (late_alloc) {
+ free_pages((unsigned long)io_tlb_orig_size,
+ get_order(io_tlb_nslabs * sizeof(size_t)));
free_pages((unsigned long)io_tlb_orig_addr,
get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
@@ -413,6 +440,8 @@ void __init swiotlb_exit(void)
} else {
memblock_free_late(__pa(io_tlb_orig_addr),
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_orig_size),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)));
memblock_free_late(__pa(io_tlb_list),
PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
memblock_free_late(io_tlb_start,
@@ -581,8 +610,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
* This is needed when we sync the memory. Then we sync the buffer if
* needed.
*/
- for (i = 0; i < nslots; i++)
+ for (i = 0; i < nslots; i++) {
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT);
+ }
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
@@ -590,6 +621,17 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
return tlb_addr;
}
+static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size)
+{
+ if (*size > orig_size) {
+ /* Warn and truncate the size to the originally mapped size */
+ dev_WARN_ONCE(hwdev, 1,
+ "Attempted buffer overflow. Original size: %zu. Mapping size: %zu.\n",
+ orig_size, *size);
+ *size = orig_size;
+ }
+}
+
/*
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
@@ -602,6 +644,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
+ validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size);
+
/*
* First, sync the memory before unmapping the entry
*/
@@ -627,6 +671,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
for (i = index + nslots - 1; i >= index; i--) {
io_tlb_list[i] = ++count;
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
+ io_tlb_orig_size[i] = 0;
}
/*
* Step 2: merge the returned slots with the preceding slots,
@@ -645,12 +690,15 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
enum dma_sync_target target)
{
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ size_t orig_size = io_tlb_orig_size[index];
phys_addr_t orig_addr = io_tlb_orig_addr[index];
if (orig_addr == INVALID_PHYS_ADDR)
return;
orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
+ validate_sync_size_and_truncate(hwdev, orig_size, &size);
+
switch (target) {
case SYNC_FOR_CPU:
if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
--
2.17.1
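To make the check concrete, here is a minimal user-space model of the
bookkeeping and the clamp the patch adds. The constants and names below
are simplified stand-ins for IO_TLB_SHIFT, io_tlb_orig_size and
validate_sync_size_and_truncate, not kernel code:

#include <stdio.h>
#include <stddef.h>

#define SLOT_SIZE (1 << 11)	/* stand-in for 1 << IO_TLB_SHIFT */
#define NSLOTS 8

/* stand-in for io_tlb_orig_size[]: remaining mapped bytes per slot */
static size_t orig_size[NSLOTS];

/* record a mapping of alloc_size bytes starting at slot 'index' */
static void record_map(int index, size_t alloc_size)
{
	int i;

	for (i = 0; index + i < NSLOTS && (size_t)i * SLOT_SIZE < alloc_size; i++)
		orig_size[index + i] = alloc_size - i * SLOT_SIZE;
}

/* the clamp from the patch: never bounce more than was mapped */
static size_t validate_and_truncate(int index, size_t size)
{
	if (size > orig_size[index]) {
		fprintf(stderr, "clamping %zu to %zu\n", size, orig_size[index]);
		size = orig_size[index];
	}
	return size;
}

int main(void)
{
	record_map(2, 3000);	/* a 3000-byte mapping starting at slot 2 */
	/* a device-supplied size of 8192 is clamped back to 3000 */
	printf("%zu\n", validate_and_truncate(2, 8192));
	return 0;
}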
On Tue, Jan 12, 2021 at 04:07:29PM +0100, Martin Radev wrote:
> The size of the buffer being bounced is not checked against the size
> of the originally mapped buffer. Because the size can be controlled
> by a device, as is the case with virtio devices, this can lead to
> memory corruption.
>
I'm really worried about all these hodgepodge hacks for untrusted
hypervisors in the I/O stack. Instead of trying to harden protocols
that are fundamentally not designed for this, how about coming up
with a new paravirtualized I/O interface that is specifically
designed for use with an untrusted hypervisor from the start?
On Wed, Jan 13, 2021 at 12:30:17PM +0100, Christoph Hellwig wrote:
> [...]
>
> I'm really worried about all these hodgepodge hacks for untrusted
> hypervisors in the I/O stack. Instead of trying to harden protocols
> that are fundamentally not designed for this, how about coming up
> with a new paravirtualized I/O interface that is specifically
> designed for use with an untrusted hypervisor from the start?
Your comment makes sense, but then that would require the cooperation
of these vendors and the cloud providers to agree on something meaningful.
I am also not sure whether the end result would be better than hardening
this interface to catch corruption. There is already some validation in
the unmap path anyway.
Another possibility is to move this hardening to the common virtio code,
but I think the code may become more complicated there since it would
require tracking both the dma_addr and length for each descriptor.
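For illustration, a rough sketch of the per-descriptor state such a
virtio-level check would need; the struct and its fields here are
hypothetical, not existing virtio code:

#include <linux/types.h>

/* hypothetical per-descriptor bookkeeping in the virtio layer */
struct vring_desc_track {
	dma_addr_t dma_addr;	/* address that was passed to dma_map_*() */
	u32 len;		/* length that was passed to dma_map_*() */
};

/* clamp the device-written used length against what was mapped,
 * before it can reach dma_unmap_*() or the driver */
static u32 clamp_used_len(const struct vring_desc_track *track, u32 used_len)
{
	return used_len > track->len ? track->len : used_len;
}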
On Mon, Jan 18, 2021 at 12:44:58PM +0100, Martin Radev wrote:
> [...]
>
> Your comment makes sense, but then that would require the cooperation
> of these vendors and the cloud providers to agree on something meaningful.
> I am also not sure whether the end result would be better than hardening
> this interface to catch corruption. There is already some validation in
> the unmap path anyway.
>
> Another possibility is to move this hardening to the common virtio code,
> but I think the code may become more complicated there since it would
> require tracking both the dma_addr and length for each descriptor.
Christoph,
I've been wrestling with the same thing - this is specific to busted
drivers. And in reality you could do the same thing with a hardware
virtio device (see the example at http://thunderclap.io/) - where the
mitigation is 'enable the IOMMU to do its job'.
The AMD SEV documents speak about utilizing the IOMMU to do this (AMD
SEV-SNP), and while that is great for the future, SEV without the
IOMMU is here now.
Coming full circle here: this issue can be exploited with virtio, but
you could do the same with real hardware if you hacked the firmware.
Say you used a compromised Intel SR-IOV NIC on an AMD SEV machine,
plumbed into the guest - the IOMMU inside the guest would be the
SWIOTLB code. The last line of defense against bad firmware, so to say.
As such I am leaning towards taking this code, but I am worried
about the performance hit... though perhaps I shouldn't be, since if
you are already using swiotlb=force you are kind of taking a
performance hit anyway?
On Mon, Jan 18, 2021 at 10:14:28AM -0500, Konrad Rzeszutek Wilk wrote:
> On Mon, Jan 18, 2021 at 12:44:58PM +0100, Martin Radev wrote:
> > On Wed, Jan 13, 2021 at 12:30:17PM +0100, Christoph Hellwig wrote:
> > > On Tue, Jan 12, 2021 at 04:07:29PM +0100, Martin Radev wrote:
> > > > The size of the buffer being bounced is not checked if it happens
> > > > to be larger than the size of the mapped buffer. Because the size
> > > > can be controlled by a device, as it's the case with virtio devices,
> > > > this can lead to memory corruption.
> > > >
> > >
> > > I'm really worried about all these hodge podge hacks for not trusted
> > > hypervisors in the I/O stack. Instead of trying to harden protocols
> > > that are fundamentally not designed for this, how about instead coming
> > > up with a new paravirtualized I/O interface that is specifically
> > > designed for use with an untrusted hypervisor from the start?
> >
> > Your comment makes sense but then that would require the cooperation
> > of these vendors and the cloud providers to agree on something meaningful.
> > I am also not sure whether the end result would be better than hardening
> > this interface to catch corruption. There is already some validation in
> > unmap path anyway.
> >
> > Another possibility is to move this hardening to the common virtio code,
> > but I think the code may become more complicated there since it would
> > require tracking both the dma_addr and length for each descriptor.
>
> Christoph,
>
> I've been wrestling with the same thing - this is specific to busted
> drivers. And in reality you could do the same thing with a hardware
> virtio device (see example in http://thunderclap.io/) - where the
> mitigation is 'enable the IOMMU to do its job.'.
>
> AMD SEV documents speak about utilizing IOMMU to do this (AMD SEV-SNP)..
> and while that is great in the future, SEV without IOMMU is now here.
>
> Doing a full circle here, this issue can be exploited with virtio
> but you could say do that with real hardware too if you hacked the
> firmware, so if you say used Intel SR-IOV NIC that was compromised
> on an AMD SEV machine, and plumbed in the guest - the IOMMU inside
> of the guest would be SWIOTLB code. Last line of defense against
> bad firmware to say.
>
> As such I am leaning towards taking this code, but I am worried
> about the performance hit... though perhaps I shouldn't be, since if
> you are already using swiotlb=force you are kind of taking a
> performance hit anyway?
>
I have not measured the performance degradation. This will hit all AMD SEV,
Intel TDX, and IBM Protected Virtualization VMs. I don't expect the hit to
be large since there are only a few added operations per hundreds of copied
bytes. I could try to measure the performance hit by running some benchmarks
with virtio-net/virtio-blk/virtio-rng.
Earlier I said:
> > Another possibility is to move this hardening to the common virtio code,
> > but I think the code may become more complicated there since it would
> > require tracking both the dma_addr and length for each descriptor.
Unfortunately, this doesn't make sense. Even if there's validation for
the size in the common virtio layer, there will be some other device
which controls a dma_addr and length passed to dma_unmap* in the
corresponding driver. The device can target a specific dma-mapped private
buffer by changing the dma_addr and setting a length large enough to
overwrite the buffers following it.
So, instead of doing the check in every driver and hitting a performance
cost even when swiotlb is not used, it's probably better to fix it in
swiotlb.
@Tom Lendacky, do you think that it makes sense to harden swiotlb, or
might some other approach be better for the SEV features?
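To illustrate the dma_unmap* point above, a sketch of the vulnerable
pattern in a hypothetical driver completion path; everything here except
dma_unmap_single itself is an illustrative assumption:

#include <linux/dma-mapping.h>

/* hypothetical completion handler; used_len is read back from the
 * device, e.g. from a used-ring entry */
static void hypothetical_complete(struct device *dev, dma_addr_t dma_addr,
				  u32 used_len, enum dma_data_direction dir)
{
	/*
	 * If the device inflates used_len (or points dma_addr at another
	 * live mapping), the oversized size reaches swiotlb_bounce() via
	 * the unmap path - which is why the clamp belongs in swiotlb.
	 */
	dma_unmap_single(dev, dma_addr, used_len, dir);
}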
On Mon, Jan 25, 2021 at 07:33:35PM +0100, Martin Radev wrote:
> [...]
>
> @Tom Lendacky, do you think that it makes sense to harden swiotlb, or
> might some other approach be better for the SEV features?
I am not Tom, but this change seems the right way forward regardless of
whether it is TDX, AMD SEV, or any other architecture that encrypts
memory and uses SWIOTLB.
Let me queue it up in the development branch and do some regression testing.
On 2/2/21 10:37 AM, Konrad Rzeszutek Wilk wrote:
> [...]
>
> I am not Tom, but this change seems the right way forward regardless of
> whether it is TDX, AMD SEV, or any other architecture that encrypts
> memory and uses SWIOTLB.
Sorry, I missed the @Tom before. I'm with Konrad and believe it makes
sense to add these checks.
I'm not sure if there would be a better approach for all confidential
computing technologies. SWIOTLB works nicely, but is limited because of
the 32-bit compatible memory location. Being able to have buffers above
the 32-bit limit would alleviate that, but that is support that would have
to be developed.
Thanks,
Tom
On Tue, Feb 02, 2021 at 04:34:09PM -0600, Tom Lendacky wrote:
> [...]
>
> I'm not sure if there would be a better approach for all confidential
> computing technologies. SWIOTLB works nicely, but is limited because of
> the 32-bit compatible memory location. Being able to have buffers above
> the 32-bit limit would alleviate that, but that is support that would have
> to be developed.
Funny you mention that... Dongli (CCed) is working on exactly that and
should be posting his patches in the next couple of days.
On Mon, Jan 18, 2021 at 12:44:58PM +0100, Martin Radev wrote:
> Your comment makes sense, but then that would require the cooperation
> of these vendors and the cloud providers to agree on something meaningful.
> I am also not sure whether the end result would be better than hardening
> this interface to catch corruption. There is already some validation in
> the unmap path anyway.
So what? If you guys want to provide a new capability you'll have to do
the work. And designing a new protocol based around the fact that the
hardware/hypervisor is not trusted and a copy is always required makes
a lot more sense than throwing in band-aids all over the place.
On Wed, Feb 03, 2021 at 01:49:22PM +0100, Christoph Hellwig wrote:
> [...]
>
> So what? If you guys want to provide a new capability you'll have to do
> the work. And designing a new protocol based around the fact that the
> hardware/hypervisor is not trusted and a copy is always required makes
> a lot more sense than throwing in band-aids all over the place.
If you don't trust the hypervisor, what would this capability be in?
I suppose you mean this would need to be in the guest kernel, and this
protocol would depend on... not the hypervisor, and most certainly not
virtio or any SR-IOV device. That removes a lot of options.
The one sensible option is the IOMMU (since folks will trust OEM vendors
like Intel or AMD to provide the memory encryption, they will presumably
also trust the IOMMU) - and the vendors do have plans for that with
their IOMMU frameworks, which will remove the need for SWIOTLB (I hope).
But that is in the future, not now.
On Wed, Feb 03, 2021 at 02:36:38PM -0500, Konrad Rzeszutek Wilk wrote:
> > [...]
>
> If you don't trust the hypervisor, what would this capability be in?
Well, they don't trust the hypervisor to not attack the guest somehow,
except through the data read. I never really understood the concept,
as it leaves too many holes.
But the point is that these schemes want to force bounce buffering
because they think it is more secure. And if that is what you want,
you'd better have a protocol built around the fact that each I/O needs
to use bounce buffers, so you make those buffers the actual shared
memory used for communication, and build the protocol around it.
E.g. you don't force the ridiculous NVMe PRP offset rules on the block
layer just to let a complicated swiotlb allocation that needs to
preserve the alignment do the I/O. Instead you have a trivial
ring buffer or whatever, because you know the I/O will be copied anyway,
and none of the hard work higher layers do to make the I/O suitable
for a normal device applies.
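A minimal sketch of the kind of shared-memory transport being described,
assuming fixed-size slots and a single producer; all names here are
hypothetical:

#include <stdint.h>

#define RING_SLOTS 64
#define SLOT_BYTES 4096

struct ring_slot {
	uint32_t len;			/* valid bytes in data[], <= SLOT_BYTES */
	uint8_t data[SLOT_BYTES];	/* the bounce buffer itself */
};

struct shared_ring {
	uint32_t head;			/* producer index, guest-owned */
	uint32_t tail;			/* consumer index, host-owned */
	struct ring_slot slot[RING_SLOTS];	/* in pre-shared clear-text memory */
};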
On Fri, Feb 05, 2021 at 06:58:52PM +0100, Christoph Hellwig wrote:
> [...]
> > If you don't trust the hypervisor, what would this capability be in?
>
> Well, they don't trust the hypervisor to not attack the guest somehow,
> except through the data read. I never really understood the concept,
> as it leaves too many holes.
>
> But the point is that these schemes want to force bounce buffering
> because they think it is more secure. And if that is what you want,
> you'd better have a protocol built around the fact that each I/O needs
> to use bounce buffers, so you make those buffers the actual shared
> memory used for communication, and build the protocol around it.
Right. That is what the SWIOTLB pool ends up being: it is allocated at
bootup, where the guest tells the hypervisor that these pages are shared
and clear-text.
> E.g. you don't force the ridiculous NVMe PRP offset rules on the block
> layer just to let a complicated swiotlb allocation that needs to
> preserve the alignment do the I/O. Instead you have a trivial
I agree that NVMe is being silly. It could have allocated the coherent
pool, used that, and done its own offsets within it. That would in
essence carve out a static pool within the static SWIOTLB one.
TTM does that - it has its own DMA machinery on top of the DMA API to
deal with "passing" buffers from one application to another and the fun
of keeping track of that.
> ring buffer or whatever, because you know the I/O will be copied anyway,
> and none of the hard work higher layers do to make the I/O suitable
> for a normal device applies.
I lost you here. Sorry - are you saying we should have a simple ring
protocol (like NVMe has), where the ring entries (SG or DMA phys
addresses) are statically allocated, and whenever the NVMe driver gets
data from user space it would copy it in there?
On Mon, Feb 08, 2021 at 12:14:49PM -0500, Konrad Rzeszutek Wilk wrote:
> > ring buffer or whatever, because you know the I/O will be copied anyway,
> > and none of the hard work higher layers do to make the I/O suitable
> > for a normal device applies.
>
> I lost you here. Sorry - are you saying we should have a simple ring
> protocol (like NVMe has), where the ring entries (SG or DMA phys
> addresses) are statically allocated, and whenever the NVMe driver gets
> data from user space it would copy it in there?
Yes. Basically extend the virtio or NVMe ring/queue concept to not just
cover commands and completions, but also the data transfers.
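Continuing the hypothetical ring sketch from earlier in the thread, the
submit path would make the copy itself the bounce, so the device-visible
length is bounded by construction:

#include <string.h>
#include <stdint.h>

static int ring_submit(struct shared_ring *r, const void *buf, uint32_t len)
{
	uint32_t idx = r->head % RING_SLOTS;

	if (len > SLOT_BYTES)
		return -1;	/* caller must split the I/O */
	memcpy(r->slot[idx].data, buf, len);	/* the one unavoidable copy */
	r->slot[idx].len = len;
	r->head++;	/* publish; a real protocol needs a barrier here */
	return 0;
}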