2022-08-20 08:04:18

by Dongli Zhang

[permalink] [raw]
Subject: [PATCH RFC v2 1/1] swiotlb: split buffer into 32-bit default and 64-bit extra zones

Hello,

I previously sent out RFC v1, which introduced an extra io_tlb_mem (created
with SWIOTLB_ANY) in addition to the default io_tlb_mem (32-bit). The
dev->dma_io_tlb_mem is set to either default or the extra io_tlb_mem,
depending on dma mask. However, that is not good for setting
dev->dma_io_tlb_mem at swiotlb layer transparently as suggested by
Christoph Hellwig.

https://lore.kernel.org/all/[email protected]/

Therefore, this RFC v2 is a new implementation that follows a different
direction. The core ideas are:

1. The swiotlb is split into two zones, io_tlb_mem->zone[0] (32-bit) and
io_tlb_mem->zone[1] (64-bit).

struct io_tlb_mem {
struct io_tlb_zone zone[SWIOTLB_NR];
struct dentry *debugfs;
bool late_alloc;
bool force_bounce;
bool for_alloc;
bool has_extra;
};

struct io_tlb_zone {
phys_addr_t start;
phys_addr_t end;
void *vaddr;
unsigned long nslabs;
unsigned long used;
unsigned int nareas;
unsigned int area_nslabs;
struct io_tlb_area *areas;
struct io_tlb_slot *slots;
};

2. By default, only io_tlb_mem->zone[0] is available. The
io_tlb_mem->zone[1] is allocated conditionally if:

- the "swiotlb=" is configured to allocate extra buffer, and
- the SWIOTLB_EXTRA is set in the flag (this is to make sure arch(s) other
than x86/sev/xen will not enable it until it is fully tested by each
arch, e.g., mips/powerpc). Currently it is enabled for x86 and xen.

3. During swiotlb map, whether zone[0] (32-bit) or zone[1] (64-bit,
SWIOTLB_ANY) is used depends on
min_not_zero(*dev->dma_mask, dev->bus_dma_limit).

To test the RFC v2, here is the QEMU command line.

qemu-system-x86_64 -smp 8 -m 32G -enable-kvm -vnc :5 -hda disk.img \
-kernel path-to-linux/arch/x86_64/boot/bzImage \
-append "root=/dev/sda1 init=/sbin/init text console=ttyS0 loglevel=7 swiotlb=32768,4194304,force" \
-net nic -net user,hostfwd=tcp::5025-:22 \
-device nvme,drive=nvme01,serial=helloworld -drive file=test.qcow2,if=none,id=nvme01 \
-serial stdio

The syslog shows the following. The extra 8GB buffer is allocated.

[ 0.152251] software IO TLB: area num 8.
... ...
[ 3.706088] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
[ 3.707334] software IO TLB: mapped default [mem 0x00000000bbfd7000-0x00000000bffd7000] (64MB)
[ 3.708585] software IO TLB: mapped extra [mem 0x000000061cc00000-0x000000081cc00000] (8192MB)

After FIO is run against the NVMe device, the 64-bit buffer is used.

$ cat /sys/kernel/debug/swiotlb/io_tlb_nslabs_extra
4194304
$ cat /sys/kernel/debug/swiotlb/io_tlb_used_extra
327552

Would you mind helping to confirm whether this is the right direction to go?

Thank you very much!

Cc: Konrad Wilk <[email protected]>
Cc: Joe Jin <[email protected]>
Signed-off-by: Dongli Zhang <[email protected]>
---
arch/arm/xen/mm.c | 2 +-
arch/mips/pci/pci-octeon.c | 5 +-
arch/x86/include/asm/xen/swiotlb-xen.h | 2 +-
arch/x86/kernel/pci-dma.c | 6 +-
drivers/xen/swiotlb-xen.c | 18 +-
include/linux/swiotlb.h | 73 +++--
kernel/dma/swiotlb.c | 499 +++++++++++++++++++++------------
7 files changed, 388 insertions(+), 217 deletions(-)

diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index 3d826c0..4edfa42 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -125,7 +125,7 @@ static int __init xen_mm_init(void)
return 0;

/* we can work with the default swiotlb */
- if (!io_tlb_default_mem.nslabs) {
+ if (!io_tlb_default_mem.zone[SWIOTLB_DF].nslabs) {
rc = swiotlb_init_late(swiotlb_size_or_default(),
xen_swiotlb_gfp(), NULL);
if (rc < 0)
diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
index e457a18..0bf0859 100644
--- a/arch/mips/pci/pci-octeon.c
+++ b/arch/mips/pci/pci-octeon.c
@@ -654,6 +654,9 @@ static int __init octeon_pci_setup(void)
octeon_pci_mem_resource.end =
octeon_pci_mem_resource.start + (1ul << 30);
} else {
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_zone *zone = &mem->zone[SWIOTLB_DF];
+
/* Remap the Octeon BAR 0 to map 128MB-(128MB+4KB) */
octeon_npi_write32(CVMX_NPI_PCI_CFG04, 128ul << 20);
octeon_npi_write32(CVMX_NPI_PCI_CFG05, 0);
@@ -664,7 +667,7 @@ static int __init octeon_pci_setup(void)

/* BAR1 movable regions contiguous to cover the swiotlb */
octeon_bar1_pci_phys =
- io_tlb_default_mem.start & ~((1ull << 22) - 1);
+ zone->start & ~((1ull << 22) - 1);

for (index = 0; index < 32; index++) {
union cvmx_pci_bar1_indexx bar1_index;
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
index 77a2d19..9476a9f 100644
--- a/arch/x86/include/asm/xen/swiotlb-xen.h
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -8,7 +8,7 @@
static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
#endif

-int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
+int xen_swiotlb_fixup(void *buf, unsigned long nslabs, unsigned int flags);
int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
unsigned int address_bits,
dma_addr_t *dma_handle);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 30bbe4a..a8736e7 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -77,7 +77,7 @@ static void __init pci_xen_swiotlb_init(void)
if (!xen_initial_domain() && !x86_swiotlb_enable)
return;
x86_swiotlb_enable = true;
- x86_swiotlb_flags |= SWIOTLB_ANY;
+ x86_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_EXTRA;
swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
dma_ops = &xen_swiotlb_dma_ops;
if (IS_ENABLED(CONFIG_PCI))
@@ -90,7 +90,7 @@ int pci_xen_swiotlb_init_late(void)
return 0;

/* we can work with the default swiotlb */
- if (!io_tlb_default_mem.nslabs) {
+ if (!io_tlb_default_mem.zone[SWIOTLB_DF].nslabs) {
int rc = swiotlb_init_late(swiotlb_size_or_default(),
GFP_KERNEL, xen_swiotlb_fixup);
if (rc < 0)
@@ -120,6 +120,8 @@ void __init pci_iommu_alloc(void)
gart_iommu_hole_init();
amd_iommu_detect();
detect_intel_iommu();
+
+ x86_swiotlb_flags |= SWIOTLB_EXTRA;
swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 67aa74d..3ff082b 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -38,7 +38,6 @@
#include <asm/dma-mapping.h>

#include <trace/events/swiotlb.h>
-#define MAX_DMA_BITS 32

/*
* Quick lookup value of the bus address of the IOTLB.
@@ -104,24 +103,32 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
}

#ifdef CONFIG_X86
-int xen_swiotlb_fixup(void *buf, unsigned long nslabs)
+int xen_swiotlb_fixup(void *buf, unsigned long nslabs, unsigned int flags)
{
int rc;
unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT);
unsigned int i, dma_bits = order + PAGE_SHIFT;
dma_addr_t dma_handle;
phys_addr_t p = virt_to_phys(buf);
+ unsigned int max_dma_bits = 32;

BUILD_BUG_ON(IO_TLB_SEGSIZE & (IO_TLB_SEGSIZE - 1));
BUG_ON(nslabs % IO_TLB_SEGSIZE);

+#ifdef CONFIG_X86_64
+ if (flags & SWIOTLB_EXTRA) {
+ max_dma_bits = 64;
+ dma_bits = 64;
+ }
+#endif
+
i = 0;
do {
do {
rc = xen_create_contiguous_region(
p + (i << IO_TLB_SHIFT), order,
dma_bits, &dma_handle);
- } while (rc && dma_bits++ < MAX_DMA_BITS);
+ } while (rc && dma_bits++ < max_dma_bits);
if (rc)
return rc;

@@ -381,7 +388,10 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
static int
xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
- return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask;
+ struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_zone *zone = &mem->zone[SWIOTLB_DF];
+
+ return xen_phys_to_dma(hwdev, zone->end - 1) <= mask;
}

const struct dma_map_ops xen_swiotlb_dma_ops = {
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 35bc4e2..365e5d7 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -16,6 +16,11 @@
#define SWIOTLB_VERBOSE (1 << 0) /* verbose initialization */
#define SWIOTLB_FORCE (1 << 1) /* force bounce buffering */
#define SWIOTLB_ANY (1 << 2) /* allow any memory for the buffer */
+#define SWIOTLB_EXTRA (1 << 3) /* allow extra buffer if supported */
+
+#define SWIOTLB_DF 0
+#define SWIOTLB_EX 1
+#define SWIOTLB_NR 2

/*
* Maximum allowable number of contiguous slabs to map,
@@ -36,9 +41,9 @@

unsigned long swiotlb_size_or_default(void);
void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
- int (*remap)(void *tlb, unsigned long nslabs));
+ int (*remap)(void *tlb, unsigned long nslabs, unsigned int flags));
int swiotlb_init_late(size_t size, gfp_t gfp_mask,
- int (*remap)(void *tlb, unsigned long nslabs));
+ int (*remap)(void *tlb, unsigned long nslabs, unsigned int flags));
extern void __init swiotlb_update_mem_attributes(void);

phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
@@ -62,54 +67,78 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
#ifdef CONFIG_SWIOTLB

/**
- * struct io_tlb_mem - IO TLB Memory Pool Descriptor
+ * struct io_tlb_zone - IO TLB Memory Zone Descriptor
*
- * @start: The start address of the swiotlb memory pool. Used to do a quick
+ * @start: The start address of the swiotlb memory zone. Used to do a quick
* range check to see if the memory was in fact allocated by this
* API.
- * @end: The end address of the swiotlb memory pool. Used to do a quick
+ * @end: The end address of the swiotlb memory zone. Used to do a quick
* range check to see if the memory was in fact allocated by this
* API.
- * @vaddr: The vaddr of the swiotlb memory pool. The swiotlb memory pool
+ * @vaddr: The vaddr of the swiotlb memory zone. The swiotlb memory zone
* may be remapped in the memory encrypted case and store virtual
* address for bounce buffer operation.
* @nslabs: The number of IO TLB blocks (in groups of 64) between @start and
* @end. For default swiotlb, this is command line adjustable via
* setup_io_tlb_npages.
* @used: The number of used IO TLB block.
- * @list: The free list describing the number of free entries available
- * from each index.
- * @orig_addr: The original address corresponding to a mapped entry.
- * @alloc_size: Size of the allocated buffer.
- * @debugfs: The dentry to debugfs.
- * @late_alloc: %true if allocated using the page allocator
- * @force_bounce: %true if swiotlb bouncing is forced
- * @for_alloc: %true if the pool is used for memory allocation
- * @nareas: The area number in the pool.
- * @area_nslabs: The slot number in the area.
+ * @nareas: The area number in the zone.
+ * @area_nslabs:The slot number in the area.
+ * @areas: The IO TLB memory area descriptor.
+ * @slots: The data to track memory allocation.
*/
-struct io_tlb_mem {
+struct io_tlb_zone {
phys_addr_t start;
phys_addr_t end;
void *vaddr;
unsigned long nslabs;
unsigned long used;
- struct dentry *debugfs;
- bool late_alloc;
- bool force_bounce;
- bool for_alloc;
unsigned int nareas;
unsigned int area_nslabs;
struct io_tlb_area *areas;
struct io_tlb_slot *slots;
};
+
+/**
+ * struct io_tlb_mem - IO TLB Memory Pool Descriptor
+ *
+ * @zone: IO TLB memory zone descriptor.
+ * @debugfs: The dentry to debugfs.
+ * @late_alloc: %true if allocated using the page allocator.
+ * @force_bounce: %true if swiotlb bouncing is forced.
+ * @for_alloc: %true if the pool is used for memory allocation.
+ * @has_extra: %true if the extra zone (e.g., 64-bit) is available.
+ */
+struct io_tlb_mem {
+ struct io_tlb_zone zone[SWIOTLB_NR];
+ struct dentry *debugfs;
+ bool late_alloc;
+ bool force_bounce;
+ bool for_alloc;
+ bool has_extra;
+};
extern struct io_tlb_mem io_tlb_default_mem;

static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_zone *zone;
+ int end_zone;
+ int i;
+
+ if (!mem)
+ return false;
+
+ end_zone = mem->has_extra ? SWIOTLB_EX : SWIOTLB_DF;

- return mem && paddr >= mem->start && paddr < mem->end;
+ for (i = end_zone ; i >= 0 ; i--) {
+ zone = &mem->zone[i];
+
+ if (paddr >= zone->start && paddr < zone->end)
+ return true;
+ }
+
+ return false;
}

static inline bool is_swiotlb_force_bounce(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c5a9190..392f21b 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -76,6 +76,7 @@ struct io_tlb_slot {
phys_addr_t swiotlb_unencrypted_base;

static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long extra_nslabs;
static unsigned long default_nareas;

/**
@@ -96,24 +97,24 @@ struct io_tlb_area {

/*
* Round up number of slabs to the next power of 2. The last area is going
- * be smaller than the rest if default_nslabs is not power of two.
+ * be smaller than the rest if *nslabs_ptr is not power of two.
* The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
* otherwise a segment may span two or more areas. It conflicts with free
* contiguous slots tracking: free slots are treated contiguous no matter
* whether they cross an area boundary.
*
- * Return true if default_nslabs is rounded up.
+ * Return true if *nslabs_ptr is rounded up.
*/
-static bool round_up_default_nslabs(void)
+static bool round_up_nslabs_config(unsigned long *nslabs_ptr)
{
if (!default_nareas)
return false;

- if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
- default_nslabs = IO_TLB_SEGSIZE * default_nareas;
- else if (is_power_of_2(default_nslabs))
+ if (*nslabs_ptr < IO_TLB_SEGSIZE * default_nareas)
+ *nslabs_ptr = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(*nslabs_ptr))
return false;
- default_nslabs = roundup_pow_of_two(default_nslabs);
+ *nslabs_ptr = roundup_pow_of_two(*nslabs_ptr);
return true;
}

@@ -128,9 +129,14 @@ static void swiotlb_adjust_nareas(unsigned int nareas)
default_nareas = nareas;

pr_info("area num %d.\n", nareas);
- if (round_up_default_nslabs())
+
+ if (round_up_nslabs_config(&default_nslabs))
pr_info("SWIOTLB bounce buffer size roundup to %luMB",
(default_nslabs << IO_TLB_SHIFT) >> 20);
+
+ if (extra_nslabs && round_up_nslabs_config(&extra_nslabs))
+ pr_info("SWIOTLB extra bounce buffer size roundup to %luMB",
+ (extra_nslabs << IO_TLB_SHIFT) >> 20);
}

static int __init
@@ -144,6 +150,11 @@ static void swiotlb_adjust_nareas(unsigned int nareas)
if (*str == ',')
++str;
if (isdigit(*str))
+ extra_nslabs =
+ ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
+ if (*str == ',')
+ ++str;
+ if (isdigit(*str))
swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
if (*str == ',')
++str;
@@ -158,9 +169,11 @@ static void swiotlb_adjust_nareas(unsigned int nareas)

unsigned int swiotlb_max_segment(void)
{
- if (!io_tlb_default_mem.nslabs)
+ struct io_tlb_zone *zone = &io_tlb_default_mem.zone[SWIOTLB_DF];
+
+ if (!zone->nslabs)
return 0;
- return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE);
+ return rounddown(zone->nslabs << IO_TLB_SHIFT, PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);

@@ -181,22 +194,49 @@ void __init swiotlb_adjust_size(unsigned long size)

size = ALIGN(size, IO_TLB_SIZE);
default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
- if (round_up_default_nslabs())
+ if (round_up_nslabs_config(&default_nslabs))
size = default_nslabs << IO_TLB_SHIFT;
- pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
+ pr_info("SWIOTLB default bounce buffer size adjusted to %luMB",
+ size >> 20);
}

void swiotlb_print_info(void)
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
+ int nr_zone = mem->has_extra ? SWIOTLB_NR : SWIOTLB_NR - 1;
+ struct io_tlb_zone *zone;
+ int i;

- if (!mem->nslabs) {
+ if (!mem->zone[SWIOTLB_DF].nslabs) {
pr_warn("No low mem\n");
return;
}

- pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
- (mem->nslabs << IO_TLB_SHIFT) >> 20);
+ for (i = 0; i < nr_zone; i++) {
+ zone = &mem->zone[i];
+ pr_info("mapped %s [mem %pa-%pa] (%luMB)\n",
+ i == 0 ? "default" : "extra",
+ &zone->start, &zone->end,
+ (zone->nslabs << IO_TLB_SHIFT) >> 20);
+ }
+}
+
+static int swiotlb_zone_index(struct io_tlb_mem *mem, phys_addr_t paddr)
+{
+ struct io_tlb_zone *zone;
+ int end_zone;
+ int i;
+
+ end_zone = mem->has_extra ? SWIOTLB_EX : SWIOTLB_DF;
+
+ for (i = end_zone ; i >= 0 ; i--) {
+ zone = &mem->zone[i];
+
+ if (paddr >= zone->start && paddr < zone->end)
+ return i;
+ }
+
+ return -EINVAL;
}

static inline unsigned long io_tlb_offset(unsigned long val)
@@ -215,12 +255,13 @@ static inline unsigned long nr_slots(u64 val)
* Isolation VMs).
*/
#ifdef CONFIG_HAS_IOMEM
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+static void *swiotlb_mem_remap(struct io_tlb_zone *zone,
+ unsigned long bytes)
{
void *vaddr = NULL;

if (swiotlb_unencrypted_base) {
- phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
+ phys_addr_t paddr = zone->start + swiotlb_unencrypted_base;

vaddr = memremap(paddr, bytes, MEMREMAP_WB);
if (!vaddr)
@@ -231,7 +272,8 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
return vaddr;
}
#else
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+static void *swiotlb_mem_remap(struct io_tlb_zone *zone,
+ unsigned long bytes)
{
return NULL;
}
@@ -246,46 +288,53 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
void __init swiotlb_update_mem_attributes(void)
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
+ int nr_zone = mem->has_extra ? SWIOTLB_NR : SWIOTLB_NR - 1;
void *vaddr;
unsigned long bytes;
+ int i;

- if (!mem->nslabs || mem->late_alloc)
+ if (mem->late_alloc)
return;
- vaddr = phys_to_virt(mem->start);
- bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);

- mem->vaddr = swiotlb_mem_remap(mem, bytes);
- if (!mem->vaddr)
- mem->vaddr = vaddr;
+ for (i = 0; i < nr_zone; i++) {
+ struct io_tlb_zone *zone = &mem->zone[i];
+
+ if (!zone->nslabs)
+ continue;
+ vaddr = phys_to_virt(zone->start);
+ bytes = PAGE_ALIGN(zone->nslabs << IO_TLB_SHIFT);
+ set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
+
+ zone->vaddr = swiotlb_mem_remap(zone, bytes);
+ if (!zone->vaddr)
+ zone->vaddr = vaddr;
+ }
}

-static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
- unsigned long nslabs, unsigned int flags,
- bool late_alloc, unsigned int nareas)
+static void swiotlb_init_io_tlb_zone(struct io_tlb_zone *zone,
+ phys_addr_t start,
+ unsigned long nslabs,
+ unsigned int nareas)
{
void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;

- mem->nslabs = nslabs;
- mem->start = start;
- mem->end = mem->start + bytes;
- mem->late_alloc = late_alloc;
- mem->nareas = nareas;
- mem->area_nslabs = nslabs / mem->nareas;
-
- mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+ zone->nslabs = nslabs;
+ zone->start = start;
+ zone->end = zone->start + bytes;
+ zone->nareas = nareas;
+ zone->area_nslabs = nslabs / zone->nareas;

- for (i = 0; i < mem->nareas; i++) {
- spin_lock_init(&mem->areas[i].lock);
- mem->areas[i].index = 0;
- mem->areas[i].used = 0;
+ for (i = 0; i < zone->nareas; i++) {
+ spin_lock_init(&zone->areas[i].lock);
+ zone->areas[i].index = 0;
+ zone->areas[i].used = 0;
}

- for (i = 0; i < mem->nslabs; i++) {
- mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
- mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
- mem->slots[i].alloc_size = 0;
+ for (i = 0; i < zone->nslabs; i++) {
+ zone->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+ zone->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ zone->slots[i].alloc_size = 0;
}

/*
@@ -296,44 +345,19 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;

memset(vaddr, 0, bytes);
- mem->vaddr = vaddr;
+ zone->vaddr = vaddr;
return;
}

-/*
- * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the DMA API.
- */
-void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
- int (*remap)(void *tlb, unsigned long nslabs))
+void __init swiotlb_init_remap_zone(struct io_tlb_zone *zone,
+ unsigned long nslabs, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs,
+ unsigned int flags))
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
- unsigned long nslabs;
size_t alloc_size;
size_t bytes;
void *tlb;

- if (!addressing_limit && !swiotlb_force_bounce)
- return;
- if (swiotlb_force_disable)
- return;
-
- /*
- * default_nslabs maybe changed when adjust area number.
- * So allocate bounce buffer after adjusting area number.
- */
- if (!default_nareas)
- swiotlb_adjust_nareas(num_possible_cpus());
-
- nslabs = default_nslabs;
- if (nslabs < IO_TLB_MIN_SLABS)
- panic("%s: nslabs = %lu too small\n", __func__, nslabs);
-
- /*
- * By default allocate the bounce buffer memory from low memory, but
- * allow to pick a location everywhere for hypervisors with guest
- * memory encryption.
- */
retry:
bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
if (flags & SWIOTLB_ANY)
@@ -346,7 +370,7 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
return;
}

- if (remap && remap(tlb, nslabs) < 0) {
+ if (remap && remap(tlb, nslabs, flags) < 0) {
memblock_free(tlb, PAGE_ALIGN(bytes));

nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
@@ -356,19 +380,58 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
goto retry;
}

- alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
- mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
- if (!mem->slots)
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*zone->slots), nslabs));
+ zone->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!zone->slots)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
__func__, alloc_size, PAGE_SIZE);

- mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ zone->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
default_nareas), SMP_CACHE_BYTES);
- if (!mem->areas)
- panic("%s: Failed to allocate mem->areas.\n", __func__);
+ if (!zone->areas)
+ panic("%s: Failed to allocate zone->areas.\n", __func__);
+
+ swiotlb_init_io_tlb_zone(zone, __pa(tlb), nslabs, default_nareas);
+}
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs,
+ unsigned int flags))
+{
+ struct io_tlb_mem *mem = &io_tlb_default_mem;

- swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false,
- default_nareas);
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
+ return;
+
+ /*
+ * Both default_nslabs and extra_nslabs may be changed when the area
+ * number is adjusted.
+ * So allocate the bounce buffer after adjusting the area number.
+ */
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ if (default_nslabs < IO_TLB_MIN_SLABS)
+ panic("%s: default_nslabs = %lu too small\n", __func__,
+ default_nslabs);
+
+ swiotlb_init_remap_zone(&mem->zone[SWIOTLB_DF], default_nslabs,
+ flags & ~SWIOTLB_EXTRA, remap);
+ if (extra_nslabs && (flags & SWIOTLB_EXTRA)) {
+ swiotlb_init_remap_zone(&mem->zone[SWIOTLB_EX],
+ extra_nslabs,
+ flags | SWIOTLB_ANY, remap);
+ mem->has_extra = true;
+ }
+
+ mem->late_alloc = false;
+ mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE);

if (flags & SWIOTLB_VERBOSE)
swiotlb_print_info();
@@ -385,9 +448,11 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags)
* This should be just like above, but with some error catching.
*/
int swiotlb_init_late(size_t size, gfp_t gfp_mask,
- int (*remap)(void *tlb, unsigned long nslabs))
+ int (*remap)(void *tlb, unsigned long nslabs,
+ unsigned int flags))
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
+ struct io_tlb_zone *zone = &mem->zone[SWIOTLB_DF];
unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
unsigned char *vstart = NULL;
unsigned int order, area_order;
@@ -415,7 +480,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
return -ENOMEM;

if (remap)
- rc = remap(vstart, nslabs);
+ rc = remap(vstart, nslabs, 0);
if (rc) {
free_pages((unsigned long)vstart, order);

@@ -434,28 +499,31 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
if (!default_nareas)
swiotlb_adjust_nareas(num_possible_cpus());

- area_order = get_order(array_size(sizeof(*mem->areas),
+ area_order = get_order(array_size(sizeof(*zone->areas),
default_nareas));
- mem->areas = (struct io_tlb_area *)
+ zone->areas = (struct io_tlb_area *)
__get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
- if (!mem->areas)
+ if (!zone->areas)
goto error_area;

- mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(array_size(sizeof(*mem->slots), nslabs)));
- if (!mem->slots)
+ zone->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*zone->slots), nslabs)));
+ if (!zone->slots)
goto error_slots;

set_memory_decrypted((unsigned long)vstart,
(nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true,
- default_nareas);
+ swiotlb_init_io_tlb_zone(zone, virt_to_phys(vstart), nslabs,
+ default_nareas);
+
+ mem->late_alloc = true;
+ mem->force_bounce = swiotlb_force_bounce;

swiotlb_print_info();
return 0;

error_slots:
- free_pages((unsigned long)mem->areas, area_order);
+ free_pages((unsigned long)zone->areas, area_order);
error_area:
free_pages((unsigned long)vstart, order);
return -ENOMEM;
@@ -464,33 +532,40 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
void __init swiotlb_exit(void)
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
+ int nr_zone = mem->has_extra ? SWIOTLB_NR : SWIOTLB_NR - 1;
unsigned long tbl_vaddr;
size_t tbl_size, slots_size;
unsigned int area_order;
+ int i;

if (swiotlb_force_bounce)
return;

- if (!mem->nslabs)
- return;
+ for (i = 0 ; i < nr_zone ; i++) {
+ struct io_tlb_zone *zone = &mem->zone[i];

- pr_info("tearing down default memory pool\n");
- tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
- tbl_size = PAGE_ALIGN(mem->end - mem->start);
- slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
-
- set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
- if (mem->late_alloc) {
- area_order = get_order(array_size(sizeof(*mem->areas),
- mem->nareas));
- free_pages((unsigned long)mem->areas, area_order);
- free_pages(tbl_vaddr, get_order(tbl_size));
- free_pages((unsigned long)mem->slots, get_order(slots_size));
- } else {
- memblock_free_late(__pa(mem->areas),
- array_size(sizeof(*mem->areas), mem->nareas));
- memblock_free_late(mem->start, tbl_size);
- memblock_free_late(__pa(mem->slots), slots_size);
+ if (!zone->nslabs)
+ continue;
+
+ pr_info("tearing down %s memory pool\n",
+ i == 0 ? "default" : "extra");
+ tbl_vaddr = (unsigned long)phys_to_virt(zone->start);
+ tbl_size = PAGE_ALIGN(zone->end - zone->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*zone->slots), zone->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*zone->areas),
+ zone->nareas));
+ free_pages((unsigned long)zone->areas, area_order);
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)zone->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(__pa(zone->areas),
+ array_size(sizeof(*zone->areas), zone->nareas));
+ memblock_free_late(zone->start, tbl_size);
+ memblock_free_late(__pa(zone->slots), slots_size);
+ }
}

memset(mem, 0, sizeof(*mem));
@@ -507,15 +582,15 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
/*
* Bounce: copy the swiotlb buffer from or back to the original dma location
*/
-static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+static void swiotlb_bounce(struct device *dev, struct io_tlb_zone *zone,
+ phys_addr_t tlb_addr, size_t size,
enum dma_data_direction dir)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
- phys_addr_t orig_addr = mem->slots[index].orig_addr;
- size_t alloc_size = mem->slots[index].alloc_size;
+ int index = (tlb_addr - zone->start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = zone->slots[index].orig_addr;
+ size_t alloc_size = zone->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
- unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
+ unsigned char *vaddr = zone->vaddr + tlb_addr - zone->start;
unsigned int tlb_offset, orig_addr_offset;

if (orig_addr == INVALID_PHYS_ADDR)
@@ -579,7 +654,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
}
}

-#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT))
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}

/*
* Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
@@ -591,9 +669,10 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask)
return nr_slots(boundary_mask + 1);
}

-static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
+static unsigned int wrap_area_index(struct io_tlb_zone *zone,
+ unsigned int index)
{
- if (index >= mem->area_nslabs)
+ if (index >= zone->area_nslabs)
return 0;
return index;
}
@@ -602,15 +681,15 @@ static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index)
* Find a suitable number of IO TLB entries size that will fit this request and
* allocate a buffer from that IO TLB pool.
*/
-static int swiotlb_do_find_slots(struct device *dev, int area_index,
+static int swiotlb_do_find_slots(struct device *dev,
+ struct io_tlb_zone *zone, int area_index,
phys_addr_t orig_addr, size_t alloc_size,
unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- struct io_tlb_area *area = mem->areas + area_index;
+ struct io_tlb_area *area = zone->areas + area_index;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
- phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
+ phys_to_dma_unencrypted(dev, zone->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
@@ -622,7 +701,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
unsigned int slot_index;

BUG_ON(!nslots);
- BUG_ON(area_index >= mem->nareas);
+ BUG_ON(area_index >= zone->nareas);

/*
* For mappings with an alignment requirement don't bother looping to
@@ -635,11 +714,11 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);

spin_lock_irqsave(&area->lock, flags);
- if (unlikely(nslots > mem->area_nslabs - area->used))
+ if (unlikely(nslots > zone->area_nslabs - area->used))
goto not_found;

- slot_base = area_index * mem->area_nslabs;
- index = wrap = wrap_area_index(mem, ALIGN(area->index, stride));
+ slot_base = area_index * zone->area_nslabs;
+ index = wrap = wrap_area_index(zone, ALIGN(area->index, stride));

do {
slot_index = slot_base + index;
@@ -647,7 +726,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
if (orig_addr &&
(slot_addr(tbl_dma_addr, slot_index) &
iotlb_align_mask) != (orig_addr & iotlb_align_mask)) {
- index = wrap_area_index(mem, index + 1);
+ index = wrap_area_index(zone, index + 1);
continue;
}

@@ -659,10 +738,10 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
if (!iommu_is_span_boundary(slot_index, nslots,
nr_slots(tbl_dma_addr),
max_slots)) {
- if (mem->slots[slot_index].list >= nslots)
+ if (zone->slots[slot_index].list >= nslots)
goto found;
}
- index = wrap_area_index(mem, index + stride);
+ index = wrap_area_index(zone, index + stride);
} while (index != wrap);

not_found:
@@ -671,19 +750,19 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,

found:
for (i = slot_index; i < slot_index + nslots; i++) {
- mem->slots[i].list = 0;
- mem->slots[i].alloc_size = alloc_size - (offset +
+ zone->slots[i].list = 0;
+ zone->slots[i].alloc_size = alloc_size - (offset +
((i - slot_index) << IO_TLB_SHIFT));
}
for (i = slot_index - 1;
io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
- mem->slots[i].list; i--)
- mem->slots[i].list = ++count;
+ zone->slots[i].list; i--)
+ zone->slots[i].list = ++count;

/*
* Update the indices to avoid searching in the next round.
*/
- if (index + nslots < mem->area_nslabs)
+ if (index + nslots < zone->area_nslabs)
area->index = index + nslots;
else
area->index = 0;
@@ -692,32 +771,33 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
return slot_index;
}

-static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+static int swiotlb_find_slots(struct device *dev,
+ struct io_tlb_zone *zone, phys_addr_t orig_addr,
size_t alloc_size, unsigned int alloc_align_mask)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
- int start = raw_smp_processor_id() & (mem->nareas - 1);
+ int start = raw_smp_processor_id() & (zone->nareas - 1);
int i = start, index;

do {
- index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size,
+ index = swiotlb_do_find_slots(dev, zone, i,
+ orig_addr, alloc_size,
alloc_align_mask);
if (index >= 0)
return index;
- if (++i >= mem->nareas)
+ if (++i >= zone->nareas)
i = 0;
} while (i != start);

return -1;
}

-static unsigned long mem_used(struct io_tlb_mem *mem)
+static unsigned long mem_used(struct io_tlb_zone *zone)
{
int i;
unsigned long used = 0;

- for (i = 0; i < mem->nareas; i++)
- used += mem->areas[i].used;
+ for (i = 0; i < zone->nareas; i++)
+ used += zone->areas[i].used;
return used;
}

@@ -728,11 +808,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
+ struct io_tlb_zone *zone;
unsigned int i;
int index;
phys_addr_t tlb_addr;
+ unsigned long dma_mask;

- if (!mem || !mem->nslabs)
+ if (!mem || !mem->zone[SWIOTLB_DF].nslabs)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");

if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
@@ -744,13 +826,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}

- index = swiotlb_find_slots(dev, orig_addr,
+ dma_mask = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ if (mem->has_extra && mem->zone[SWIOTLB_EX].nslabs &&
+ dma_mask == DMA_BIT_MASK(64))
+ zone = &mem->zone[SWIOTLB_EX];
+ else
+ zone = &mem->zone[SWIOTLB_DF];
+
+ index = swiotlb_find_slots(dev, zone, orig_addr,
alloc_size + offset, alloc_align_mask);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- alloc_size, mem->nslabs, mem_used(mem));
+ alloc_size, zone->nslabs, mem_used(zone));
return (phys_addr_t)DMA_MAPPING_ERROR;
}

@@ -760,8 +849,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* needed.
*/
for (i = 0; i < nr_slots(alloc_size + offset); i++)
- mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
- tlb_addr = slot_addr(mem->start, index) + offset;
+ zone->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(zone->start, index) + offset;
/*
* When dir == DMA_FROM_DEVICE we could omit the copy from the orig
* to the tlb buffer, if we knew for sure the device will
@@ -769,19 +858,19 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
* unconditional bounce may prevent leaking swiotlb content (i.e.
* kernel memory) to user-space.
*/
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, zone, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}

-static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
+static void swiotlb_release_slots(struct device *dev,
+ struct io_tlb_zone *zone, phys_addr_t tlb_addr)
{
- struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long flags;
unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
- int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
- int nslots = nr_slots(mem->slots[index].alloc_size + offset);
- int aindex = index / mem->area_nslabs;
- struct io_tlb_area *area = &mem->areas[aindex];
+ int index = (tlb_addr - offset - zone->start) >> IO_TLB_SHIFT;
+ int nslots = nr_slots(zone->slots[index].alloc_size + offset);
+ int aindex = index / zone->area_nslabs;
+ struct io_tlb_area *area = &zone->areas[aindex];
int count, i;

/*
@@ -790,11 +879,11 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
* While returning the entries to the free list, we merge the entries
* with slots below and above the pool being returned.
*/
- BUG_ON(aindex >= mem->nareas);
+ BUG_ON(aindex >= zone->nareas);

spin_lock_irqsave(&area->lock, flags);
if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
- count = mem->slots[index + nslots].list;
+ count = zone->slots[index + nslots].list;
else
count = 0;

@@ -803,9 +892,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
* superceeding slots
*/
for (i = index + nslots - 1; i >= index; i--) {
- mem->slots[i].list = ++count;
- mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
- mem->slots[i].alloc_size = 0;
+ zone->slots[i].list = ++count;
+ zone->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ zone->slots[i].alloc_size = 0;
}

/*
@@ -813,9 +902,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
* available (non zero)
*/
for (i = index - 1;
- io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && zone->slots[i].list;
i--)
- mem->slots[i].list = ++count;
+ zone->slots[i].list = ++count;
area->used -= nslots;
spin_unlock_irqrestore(&area->lock, flags);
}
@@ -827,21 +916,30 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
size_t mapping_size, enum dma_data_direction dir,
unsigned long attrs)
{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ int zone_index = swiotlb_zone_index(mem, tlb_addr);
+ struct io_tlb_zone *zone = &mem->zone[zone_index];
+
/*
* First, sync the memory before unmapping the entry
*/
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, zone, tlb_addr, mapping_size,
+ DMA_FROM_DEVICE);

- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, zone, tlb_addr);
}

void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ int zone_index = swiotlb_zone_index(mem, tlb_addr);
+ struct io_tlb_zone *zone = &mem->zone[zone_index];
+
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(dev, zone, tlb_addr, size, DMA_TO_DEVICE);
else
BUG_ON(dir != DMA_FROM_DEVICE);
}
@@ -849,8 +947,12 @@ void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ int zone_index = swiotlb_zone_index(mem, tlb_addr);
+ struct io_tlb_zone *zone = &mem->zone[zone_index];
+
if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(dev, zone, tlb_addr, size, DMA_FROM_DEVICE);
else
BUG_ON(dir != DMA_TO_DEVICE);
}
@@ -908,13 +1010,14 @@ bool is_swiotlb_active(struct device *dev)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;

- return mem && mem->nslabs;
+ return mem && mem->zone[SWIOTLB_DF].nslabs;
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);

static int io_tlb_used_get(void *data, u64 *val)
{
- *val = mem_used(&io_tlb_default_mem);
+ struct io_tlb_zone *zone = (struct io_tlb_zone *) data;
+ *val = mem_used(zone);
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
@@ -922,13 +1025,26 @@ static int io_tlb_used_get(void *data, u64 *val)
static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
const char *dirname)
{
+ struct io_tlb_zone *zone;
+
mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
- if (!mem->nslabs)
- return;

- debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
- debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL,
- &fops_io_tlb_used);
+ zone = &mem->zone[SWIOTLB_DF];
+ if (zone->nslabs) {
+ debugfs_create_ulong("io_tlb_nslabs", 0400,
+ mem->debugfs, &zone->nslabs);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs,
+ zone, &fops_io_tlb_used);
+ }
+
+ if (mem->has_extra) {
+ zone = &mem->zone[SWIOTLB_EX];
+
+ debugfs_create_ulong("io_tlb_nslabs_extra", 0400,
+ mem->debugfs, &zone->nslabs);
+ debugfs_create_file("io_tlb_used_extra", 0400,
+ mem->debugfs, zone, &fops_io_tlb_used);
+ }
}

static int __init __maybe_unused swiotlb_create_default_debugfs(void)
@@ -946,17 +1062,20 @@ static int __init __maybe_unused swiotlb_create_default_debugfs(void)
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_zone *zone;
phys_addr_t tlb_addr;
int index;

if (!mem)
return NULL;

- index = swiotlb_find_slots(dev, 0, size, 0);
+ zone = &mem->zone[SWIOTLB_DF];
+
+ index = swiotlb_find_slots(dev, zone, 0, size, 0);
if (index == -1)
return NULL;

- tlb_addr = slot_addr(mem->start, index);
+ tlb_addr = slot_addr(zone->start, index);

return pfn_to_page(PFN_DOWN(tlb_addr));
}
@@ -964,11 +1083,14 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
bool swiotlb_free(struct device *dev, struct page *page, size_t size)
{
phys_addr_t tlb_addr = page_to_phys(page);
+ struct io_tlb_mem *mem = &dev->dma_io_tlb_mem;
+ int zone_index = swiotlb_zone_index(mem, tlb_addr);
+ struct io_tlb_zone *zone = mem->zone[zone_index];

if (!is_swiotlb_buffer(dev, tlb_addr))
return false;

- swiotlb_release_slots(dev, tlb_addr);
+ swiotlb_release_slots(dev, zone, tlb_addr);

return true;
}
@@ -977,6 +1099,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct device *dev)
{
struct io_tlb_mem *mem = rmem->priv;
+ struct io_tlb_zone *zone;
unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;

/* Set Per-device io tlb area to one */
@@ -992,25 +1115,29 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
if (!mem)
return -ENOMEM;

- mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
- if (!mem->slots) {
+ zone = &mem->zone[SWIOTLB_DF];
+
+ zone->slots = kcalloc(nslabs, sizeof(*zone->slots), GFP_KERNEL);
+ if (!zone->slots) {
kfree(mem);
return -ENOMEM;
}

- mem->areas = kcalloc(nareas, sizeof(*mem->areas),
+ zone->areas = kcalloc(nareas, sizeof(*zone->areas),
GFP_KERNEL);
- if (!mem->areas) {
- kfree(mem->slots);
+ if (!zone->areas) {
+ kfree(zone->slots);
kfree(mem);
return -ENOMEM;
}

set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
rmem->size >> PAGE_SHIFT);
- swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE,
- false, nareas);
+ swiotlb_init_io_tlb_zone(zone, rmem->base, nslabs, nareas);
mem->for_alloc = true;
+ mem->has_extra = false;
+ mem->late_alloc = false;
+ mem->force_bounce = flags & SWIOTLB_FORCE;

rmem->priv = mem;

--
1.8.3.1


2022-09-20 08:12:09

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH RFC v2 1/1] swiotlb: split buffer into 32-bit default and 64-bit extra zones

On Sat, Aug 20, 2022 at 12:42:50AM -0700, Dongli Zhang wrote:
> Hello,
>
> I used to send out RFC v1 to introduce an extra io_tlb_mem (created with
> SWIOTLB_ANY) in addition to the default io_tlb_mem (32-bit). The
> dev->dma_io_tlb_mem is set to either default or the extra io_tlb_mem,
> depending on dma mask. However, that is not good for setting
> dev->dma_io_tlb_mem at swiotlb layer transparently as suggested by
> Christoph Hellwig.
>
> https://lore.kernel.org/all/[email protected]/
>
> Therefore, this is another RFC v2 implementation following a different
> direction. The core ideas are:
>
> 1. The swiotlb is splited into two zones, io_tlb_mem->zone[0] (32-bit) and
> io_tlb_mem->zone[1] (64-bit).
>
> struct io_tlb_mem {
> struct io_tlb_zone zone[SWIOTLB_NR];
> struct dentry *debugfs;
> bool late_alloc;
> bool force_bounce;
> bool for_alloc;
> bool has_extra;
> };
>
> struct io_tlb_zone {
> phys_addr_t start;
> phys_addr_t end;
> void *vaddr;
> unsigned long nslabs;
> unsigned long used;
> unsigned int nareas;
> unsigned int area_nslabs;
> struct io_tlb_area *areas;
> struct io_tlb_slot *slots;
> };
>
> 2. By default, only io_tlb_mem->zone[0] is available. The
> io_tlb_mem->zone[1] is allocated conditionally if:
>
> - the "swiotlb=" is configured to allocate extra buffer, and
> - the SWIOTLB_EXTRA is set in the flag (this is to make sure arch(s) other
> than x86/sev/xen will not enable it until it is fully tested by each
> arch, e.g., mips/powerpc). Currently it is enabled for x86 and xen.
>
> 3. During swiotlb map, whether zone[0] (32-bit) or zone[1] (64-bit
> SWIOTLB_ANY)
> is used depends on min_not_zero(*dev->dma_mask, dev->bus_dma_limit).
>
> To test the RFC v2, here is the QEMU command line.
>
> qemu-system-x86_64 -smp 8 -m 32G -enable-kvm -vnc :5 -hda disk.img \
> -kernel path-to-linux/arch/x86_64/boot/bzImage \
> -append "root=/dev/sda1 init=/sbin/init text console=ttyS0 loglevel=7 swiotlb=32768,4194304,force" \
> -net nic -net user,hostfwd=tcp::5025-:22 \
> -device nvme,drive=nvme01,serial=helloworld -drive file=test.qcow2,if=none,id=nvme01 \
> -serial stdio
>
> There is below in syslog. The extra 8GB buffer is allocated.
>
> [ 0.152251] software IO TLB: area num 8.
> ... ...
> [ 3.706088] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
> [ 3.707334] software IO TLB: mapped default [mem 0x00000000bbfd7000-0x00000000bffd7000] (64MB)
> [ 3.708585] software IO TLB: mapped extra [mem 0x000000061cc00000-0x000000081cc00000] (8192MB)
>
> After the FIO is triggered over NVMe, the 64-bit buffer is used.
>
> $ cat /sys/kernel/debug/swiotlb/io_tlb_nslabs_extra
> 4194304
> $ cat /sys/kernel/debug/swiotlb/io_tlb_used_extra
> 327552
>
> Would you mind helping if this is the right direction to go?
>
> Thank you very much!
>
> Cc: Konrad Wilk <[email protected]>
> Cc: Joe Jin <[email protected]>
> Signed-off-by: Dongli Zhang <[email protected]>
> ---
> arch/arm/xen/mm.c | 2 +-
> arch/mips/pci/pci-octeon.c | 5 +-
> arch/x86/include/asm/xen/swiotlb-xen.h | 2 +-
> arch/x86/kernel/pci-dma.c | 6 +-
> drivers/xen/swiotlb-xen.c | 18 +-
> include/linux/swiotlb.h | 73 +++--

> diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
> index 3d826c0..4edfa42 100644
> --- a/arch/arm/xen/mm.c
> +++ b/arch/arm/xen/mm.c
> @@ -125,7 +125,7 @@ static int __init xen_mm_init(void)
> return 0;
>
> /* we can work with the default swiotlb */
> - if (!io_tlb_default_mem.nslabs) {
> + if (!io_tlb_default_mem.zone[SWIOTLB_DF].nslabs) {
> rc = swiotlb_init_late(swiotlb_size_or_default(),
> xen_swiotlb_gfp(), NULL);
> if (rc < 0)

The first thing we need before doing anything about multiple default
pools is to get all knowledge of these details hidden inside swiotlb.c.

For swiotlb_init_late that seems easy as we can just move the check
into it.

> diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
> index e457a18..0bf0859 100644
> --- a/arch/mips/pci/pci-octeon.c
> +++ b/arch/mips/pci/pci-octeon.c
> @@ -654,6 +654,9 @@ static int __init octeon_pci_setup(void)
> octeon_pci_mem_resource.end =
> octeon_pci_mem_resource.start + (1ul << 30);
> } else {
> + struct io_tlb_mem *mem = &io_tlb_default_mem;
> + struct io_tlb_zone *zone = &mem->zone[SWIOTLB_DF];
> +
> /* Remap the Octeon BAR 0 to map 128MB-(128MB+4KB) */
> octeon_npi_write32(CVMX_NPI_PCI_CFG04, 128ul << 20);
> octeon_npi_write32(CVMX_NPI_PCI_CFG05, 0);
> @@ -664,7 +667,7 @@ static int __init octeon_pci_setup(void)
>
> /* BAR1 movable regions contiguous to cover the swiotlb */
> octeon_bar1_pci_phys =
> - io_tlb_default_mem.start & ~((1ull << 22) - 1);
> + zone->start & ~((1ull << 22) - 1);

But we'll need to do something about this mess. I'll need help from
the octeon maintainer on it.

> - x86_swiotlb_flags |= SWIOTLB_ANY;
> + x86_swiotlb_flags |= SWIOTLB_ANY | SWIOTLB_EXTRA;

I don't think this is how it is supposed to be. SWIOTLB_ANY already
says give me a pool with no addressing constraints. We don't need
two pools without that. EXTRA is also not exactly a very helpful
name here.

> #ifdef CONFIG_X86
> -int xen_swiotlb_fixup(void *buf, unsigned long nslabs)
> +int xen_swiotlb_fixup(void *buf, unsigned long nslabs, unsigned int flags)
> {
> int rc;
> unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT);
> unsigned int i, dma_bits = order + PAGE_SHIFT;
> dma_addr_t dma_handle;
> phys_addr_t p = virt_to_phys(buf);
> + unsigned int max_dma_bits = 32;

I think life will be a lot simpler if the addressing bits are passed to
this function, and not some kind of flags.

> +#define SWIOTLB_DF 0
> +#define SWIOTLB_EX 1
> +#define SWIOTLB_NR 2

These names are not very descriptive.