2023-08-14 18:23:51

by Robin Murphy

Subject: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible

Hi all,

Prompted by [1], which reminded me I started this a while ago, I've now
finished off my own attempt at sorting out the horrid lack of rcache
scalability. It's become quite clear that given the vast range of system
sizes and workloads there is no right size for a fixed depot array, so I
reckon we're better off not having one at all.

Note that the reclaim threshold and rate are chosen fairly arbitrarily -
it's enough of a challenge to get my 4-core dev board with spinning disk
and gigabit ethernet to push anything into a depot at all :)

Thanks,
Robin.

[1] https://lore.kernel.org/linux-iommu/[email protected]


Robin Murphy (2):
iommu/iova: Make the rcache depot scale better
iommu/iova: Manage the depot list size

drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
1 file changed, 65 insertions(+), 29 deletions(-)

--
2.39.2.101.g768bb238c484.dirty



2023-08-14 19:02:45

by Robin Murphy

Subject: [PATCH 1/2] iommu/iova: Make the rcache depot scale better

The algorithm in the original paper specifies the storage of full
magazines in the depot as an unbounded list rather than a fixed-size
array. It turns out to be pretty straightforward to do this in our
implementation with no significant loss of efficiency. This allows
the depot to scale up to the working set sizes of larger systems,
while also potentially saving some memory on smaller ones too.

Signed-off-by: Robin Murphy <[email protected]>
---
drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 10b964600948..d2de6fb0e9f4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
* will be wasted.
*/
#define IOVA_MAG_SIZE 127
-#define MAX_GLOBAL_MAGS 32 /* magazines per bin */

struct iova_magazine {
- unsigned long size;
+ /*
+ * Only full magazines are inserted into the depot, so we can avoid
+ * a separate list head and preserve maximum space-efficiency.
+ */
+ union {
+ unsigned long size;
+ struct iova_magazine *next;
+ };
unsigned long pfns[IOVA_MAG_SIZE];
};

@@ -640,8 +646,7 @@ struct iova_cpu_rcache {

struct iova_rcache {
spinlock_t lock;
- unsigned long depot_size;
- struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+ struct iova_magazine *depot;
struct iova_cpu_rcache __percpu *cpu_rcaches;
};

@@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
mag->pfns[mag->size++] = pfn;
}

+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
+{
+ struct iova_magazine *mag = rcache->depot;
+
+ rcache->depot = mag->next;
+ mag->size = IOVA_MAG_SIZE;
+ return mag;
+}
+
+static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
+{
+ mag->next = rcache->depot;
+ rcache->depot = mag;
+}
+
int iova_domain_init_rcaches(struct iova_domain *iovad)
{
unsigned int cpu;
@@ -734,7 +754,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)

rcache = &iovad->rcaches[i];
spin_lock_init(&rcache->lock);
- rcache->depot_size = 0;
rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
cache_line_size());
if (!rcache->cpu_rcaches) {
@@ -776,7 +795,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
struct iova_rcache *rcache,
unsigned long iova_pfn)
{
- struct iova_magazine *mag_to_free = NULL;
struct iova_cpu_rcache *cpu_rcache;
bool can_insert = false;
unsigned long flags;
@@ -794,12 +812,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,

if (new_mag) {
spin_lock(&rcache->lock);
- if (rcache->depot_size < MAX_GLOBAL_MAGS) {
- rcache->depot[rcache->depot_size++] =
- cpu_rcache->loaded;
- } else {
- mag_to_free = cpu_rcache->loaded;
- }
+ iova_depot_push(rcache, cpu_rcache->loaded);
spin_unlock(&rcache->lock);

cpu_rcache->loaded = new_mag;
@@ -812,11 +825,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,

spin_unlock_irqrestore(&cpu_rcache->lock, flags);

- if (mag_to_free) {
- iova_magazine_free_pfns(mag_to_free, iovad);
- iova_magazine_free(mag_to_free);
- }
-
return can_insert;
}

@@ -854,9 +862,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
has_pfn = true;
} else {
spin_lock(&rcache->lock);
- if (rcache->depot_size > 0) {
+ if (rcache->depot) {
iova_magazine_free(cpu_rcache->loaded);
- cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+ cpu_rcache->loaded = iova_depot_pop(rcache);
has_pfn = true;
}
spin_unlock(&rcache->lock);
@@ -894,10 +902,10 @@ static void free_iova_rcaches(struct iova_domain *iovad)
{
struct iova_rcache *rcache;
struct iova_cpu_rcache *cpu_rcache;
+ struct iova_magazine *mag;
unsigned int cpu;
- int i, j;

- for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
rcache = &iovad->rcaches[i];
if (!rcache->cpu_rcaches)
break;
@@ -907,8 +915,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
iova_magazine_free(cpu_rcache->prev);
}
free_percpu(rcache->cpu_rcaches);
- for (j = 0; j < rcache->depot_size; ++j)
- iova_magazine_free(rcache->depot[j]);
+ while ((mag = iova_depot_pop(rcache)))
+ iova_magazine_free(mag);
}

kfree(iovad->rcaches);
@@ -941,17 +949,16 @@ static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
static void free_global_cached_iovas(struct iova_domain *iovad)
{
struct iova_rcache *rcache;
+ struct iova_magazine *mag;
unsigned long flags;
- int i, j;

- for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
rcache = &iovad->rcaches[i];
spin_lock_irqsave(&rcache->lock, flags);
- for (j = 0; j < rcache->depot_size; ++j) {
- iova_magazine_free_pfns(rcache->depot[j], iovad);
- iova_magazine_free(rcache->depot[j]);
+ while ((mag = iova_depot_pop(rcache))) {
+ iova_magazine_free_pfns(mag, iovad);
+ iova_magazine_free(mag);
}
- rcache->depot_size = 0;
spin_unlock_irqrestore(&rcache->lock, flags);
}
}
--
2.39.2.101.g768bb238c484.dirty


2023-08-14 20:35:23

by Robin Murphy

Subject: [PATCH 2/2] iommu/iova: Manage the depot list size

Automatically scaling the depot up to suit the peak capacity of a
workload is all well and good, but it would be nice to have a way to
scale it back down again if the workload changes. To that end, add
automatic reclaim that will gradually free unused magazines if the
depot size remains above a reasonable threshold for long enough.

Signed-off-by: Robin Murphy <[email protected]>
---
drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d2de6fb0e9f4..76a7d694708e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/bitops.h>
#include <linux/cpu.h>
+#include <linux/workqueue.h>

/* The anchor node sits above the top of the usable address space */
#define IOVA_ANCHOR ~0UL
@@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
*/
#define IOVA_MAG_SIZE 127

+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
+
struct iova_magazine {
/*
* Only full magazines are inserted into the depot, so we can avoid
@@ -646,8 +649,11 @@ struct iova_cpu_rcache {

struct iova_rcache {
spinlock_t lock;
+ unsigned int depot_size;
struct iova_magazine *depot;
struct iova_cpu_rcache __percpu *cpu_rcaches;
+ struct iova_domain *iovad;
+ struct delayed_work work;
};

static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
@@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)

rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
+ rcache->depot_size--;
return mag;
}

@@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
{
mag->next = rcache->depot;
rcache->depot = mag;
+ rcache->depot_size++;
+}
+
+static void iova_depot_work_func(struct work_struct *work)
+{
+ struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
+ struct iova_magazine *mag = NULL;
+
+ spin_lock(&rcache->lock);
+ if (rcache->depot_size > num_online_cpus())
+ mag = iova_depot_pop(rcache);
+ spin_unlock(&rcache->lock);
+
+ if (mag) {
+ iova_magazine_free_pfns(mag, rcache->iovad);
+ iova_magazine_free(mag);
+ schedule_delayed_work(&rcache->work, msecs_to_jiffies(IOVA_DEPOT_DELAY));
+ }
}

int iova_domain_init_rcaches(struct iova_domain *iovad)
@@ -754,6 +779,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)

rcache = &iovad->rcaches[i];
spin_lock_init(&rcache->lock);
+ rcache->iovad = iovad;
+ INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
cache_line_size());
if (!rcache->cpu_rcaches) {
@@ -814,6 +841,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
spin_lock(&rcache->lock);
iova_depot_push(rcache, cpu_rcache->loaded);
spin_unlock(&rcache->lock);
+ schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);

cpu_rcache->loaded = new_mag;
can_insert = true;
@@ -915,6 +943,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
iova_magazine_free(cpu_rcache->prev);
}
free_percpu(rcache->cpu_rcaches);
+ cancel_delayed_work_sync(&rcache->work);
while ((mag = iova_depot_pop(rcache)))
iova_magazine_free(mag);
}
--
2.39.2.101.g768bb238c484.dirty


2023-08-15 17:25:54

by Robin Murphy

Subject: Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible

On 15/08/2023 11:24 am, John Garry wrote:
> On 14/08/2023 18:53, Robin Murphy wrote:
>> Hi all,
>>
>
> Hi Robin,
>
>> Prompted by [1], which reminded me I started this a while ago, I've now
>> finished off my own attempt at sorting out the horrid lack of rcache
>> scalability. It's become quite clear that given the vast range of system
>> sizes and workloads there is no right size for a fixed depot array, so I
>> reckon we're better off not having one at all.
>>
>> Note that the reclaim threshold and rate are chosen fairly arbitrarily -
>
> This threshold is the number of online CPUs, right?

Yes, that's nominally half of the current fixed size (based on all the
performance figures from the original series seemingly coming from a
16-thread machine), but it seemed like a fair compromise. I am of course
keen to see how real-world testing actually pans out.

>> it's enough of a challenge to get my 4-core dev board with spinning disk
>> and gigabit ethernet to push anything into a depot at all :)
>>
>
> I have to admit that I was hoping to also see a more aggressive reclaim
> strategy, where we also trim the per-CPU rcaches when not in use.
> Leizhen proposed something like this a long time ago.

Don't think I haven't been having various elaborate ideas for making it
cleverer with multiple thresholds and self-tuning, however I have
managed to restrain myself ;)

At this point I'm just looking to confirm whether the fundamental
concepts are sound, and at least no worse than the current behaviour
(hence keeping it split into 2 distinct patches for the sake of review
and debugging). If it proves solid then we can absolutely come back and
go to town on enhancements later.

Cheers,
Robin.

2023-08-17 08:28:25

by Robin Murphy

Subject: Re: [PATCH 2/2] iommu/iova: Manage the depot list size

On 15/08/2023 3:11 pm, zhangzekun (A) wrote:
>
>
> On 2023/8/15 1:53, Robin Murphy wrote:
>> Automatically scaling the depot up to suit the peak capacity of a
>> workload is all well and good, but it would be nice to have a way to
>> scale it back down again if the workload changes. To that end, add
>> automatic reclaim that will gradually free unused magazines if the
>> depot size remains above a reasonable threshold for long enough.
>>
>> Signed-off-by: Robin Murphy <[email protected]>
>> ---
>>   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
>>   1 file changed, 29 insertions(+)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index d2de6fb0e9f4..76a7d694708e 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -11,6 +11,7 @@
>>   #include <linux/smp.h>
>>   #include <linux/bitops.h>
>>   #include <linux/cpu.h>
>> +#include <linux/workqueue.h>
>>   /* The anchor node sits above the top of the usable address space */
>>   #define IOVA_ANCHOR    ~0UL
>> @@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    */
>>   #define IOVA_MAG_SIZE 127
>> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
>> +
>>   struct iova_magazine {
>>       /*
>>        * Only full magazines are inserted into the depot, so we can avoid
>> @@ -646,8 +649,11 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> +    unsigned int depot_size;
>>       struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>> +    struct iova_domain *iovad;
>> +    struct delayed_work work;
>>   };
>>   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
>> @@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct
>> iova_rcache *rcache)
>>       rcache->depot = mag->next;
>>       mag->size = IOVA_MAG_SIZE;
>> +    rcache->depot_size--;
>>       return mag;
>>   }
>> @@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache
>> *rcache, struct iova_magazine *ma
>>   {
>>       mag->next = rcache->depot;
>>       rcache->depot = mag;
>> +    rcache->depot_size++;
>> +}
>> +
>> +static void iova_depot_work_func(struct work_struct *work)
>> +{
>> +    struct iova_rcache *rcache = container_of(work, typeof(*rcache),
>> work.work);
>> +    struct iova_magazine *mag = NULL;
>> +
>> +    spin_lock(&rcache->lock);
>> +    if (rcache->depot_size > num_online_cpus())
>> +        mag = iova_depot_pop(rcache);
>> +    spin_unlock(&rcache->lock);
>> +
>> +    if (mag) {
>> +        iova_magazine_free_pfns(mag, rcache->iovad);
>> +        iova_magazine_free(mag);
>> +        schedule_delayed_work(&rcache->work,
>> msecs_to_jiffies(IOVA_DEPOT_DELAY));
> Hi, Robin,
>
> I am a little confused why msecs_to_jiffies() is applied to
> IOVA_DEPOT_DELAY again in iova_depot_work_func(), as it already equals
> "msecs_to_jiffies(100)".

Oof, not sure how I managed to leave a mere 3-line refactoring
half-finished... yeah, this msecs_to_jiffies() just shouldn't be here :)

> Besides, do we really need to schedule another delayed_work from
> within iova_depot_work_func()? Each time we put an IOVA magazine into
> the depot, a delayed_work is already scheduled, which is responsible
> for freeing an IOVA magazine from the depot if the depot size is
> greater than num_online_cpus().

The idea is to free excess magazines one at a time at a relatively low
rate, so as not to interfere too much with "bursty" workloads which
might release a large number of IOVAs at once, but then want to
reallocate them again relatively soon. I'm hoping that the overhead of
scheduling the reclaim work unconditionally whenever the depot grows is
sufficiently negligible to avoid having to check the threshold in
multiple places, as that's the part which I anticipate might grow more
complex in future. As far as I could see it should be pretty minimal if
the work is already scheduled, which I'd expect to be the case most of
the time while the depot is busy. The reason the work also reschedules
itself is to handle the opposite situation, and make sure it can run to
completion after the depot goes idle.
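
With the stray msecs_to_jiffies() dropped as noted above, the work
function would then look something like this (a sketch of the intended
shape for v2, not the final patch):

static void iova_depot_work_func(struct work_struct *work)
{
	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
	struct iova_magazine *mag = NULL;

	spin_lock(&rcache->lock);
	/* Only reclaim while the depot is above the nominal threshold */
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
	spin_unlock(&rcache->lock);

	if (mag) {
		iova_magazine_free_pfns(mag, rcache->iovad);
		iova_magazine_free(mag);
		/*
		 * Re-arm so that reclaim runs to completion after the depot
		 * goes idle; IOVA_DEPOT_DELAY is already in jiffies.
		 */
		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
	}
}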

Thanks,
Robin.

2023-08-19 12:47:30

by Robin Murphy

Subject: Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible

On 15/08/2023 2:35 pm, John Garry wrote:
> On 15/08/2023 12:11, Robin Murphy wrote:
>>>
>>> This threshold is the number of online CPUs, right?
>>
>> Yes, that's nominally half of the current fixed size (based on all the
>> performance figures from the original series seemingly coming from a
>> 16-thread machine,
>
> If you are talking about
> https://lore.kernel.org/linux-iommu/[email protected]/,

No, I mean the *original* rcache patch submission, and its associated paper:

https://lore.kernel.org/linux-iommu/[email protected]/

> then I think it's a 256-CPU system and the DMA controller has 16 HW
> queues. The 16 HW queues are relevant as the per-completion queue
> interrupt handler runs on a fixed CPU from the set of 16 CPUs in the
> HW queue interrupt handler affinity mask. And what this means is while
> any CPU may alloc an IOVA, only those 16 CPUs handling each HW queue
> interrupt will be freeing IOVAs.
>
>> but seemed like a fair compromise. I am of course keen to see how
>> real-world testing actually pans out.
>>
>>>> it's enough of a challenge to get my 4-core dev board with spinning
>>>> disk
>>>> and gigabit ethernet to push anything into a depot at all :)
>>>>
>>>
>>> I have to admit that I was hoping to also see a more aggressive
>>> reclaim strategy, where we also trim the per-CPU rcaches when not in
>>> use. Leizhen proposed something like this a long time ago.
>>
>> Don't think I haven't been having various elaborate ideas for making
>> it cleverer with multiple thresholds and self-tuning, however I have
>> managed to restrain myself ;)
>>
>
> OK, understood. My main issue WRT scalability is that the total
> cacheable IOVAs (CPU and depot rcache) scales up with the number of
> CPUs, but many DMA controllers have a fixed number of max in-flight
> requests.
>
> Consider a SCSI storage controller on a 256-CPU system. The in-flight
> limit for this example controller is 4096, which would typically never
> even be used up, or may not even be usable.
>
> For this device, we need 4096 * 6 [IOVA rcache range] = ~24K cached
> IOVAs if we were to pre-allocate them all - obviously I am ignoring that
> we have the per-CPU rcache for speed and it would not make sense to
> share one set. However, according to the current IOVA driver, we can in
> theory cache up to ((256 [CPUs] * 2 [loaded + prev]) + 32 [depot size]) *
> 6 [rcache range] * 128 (IOVA per mag) = ~420K IOVAs. That's ~17x what we
> would ever need.
>
> Something like NVMe is different, as its total requests can scale up
> with the CPU count, but only to a limit. I am not sure about network
> controllers.

Remember that this threshold only represents a point at which we
consider the cache to have grown "big enough" to start background
reclaim - over the short term it is neither an upper nor a lower limit
on the cache capacity itself. Indeed it will be larger than the working
set of some workloads, but then it still wants to be enough of a buffer
to be useful for others which do make big bursts of allocations only
periodically.

> Anyway, this is just something which I think should be considered -
> which I guess already has been.

Indeed, I would tend to assume that machines with hundreds of CPUs are
less likely to be constrained on overall memory and/or IOVA space, so
tuning for a more responsive cache should be more beneficial than any
potential wastage is detrimental.

Cheers,
Robin.

2023-08-19 23:42:31

by John Garry

Subject: Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible

On 15/08/2023 12:11, Robin Murphy wrote:
>>
>> This threshold is the number of online CPUs, right?
>
> Yes, that's nominally half of the current fixed size (based on all the
> performance figures from the original series seemingly coming from a
> 16-thread machine,

If you are talking about
https://lore.kernel.org/linux-iommu/[email protected]/,
then I think it's a 256-CPU system and the DMA controller has 16 HW
queues. The 16 HW queues are relevant as the per-completion queue
interrupt handler runs on a fixed CPU from the set of 16 CPUs in the HW
queue interrupt handler affinity mask. And what this means is while any
CPU may alloc an IOVA, only those 16 CPUs handling each HW queue
interrupt will be freeing IOVAs.

> but seemed like a fair compromise. I am of course
> keen to see how real-world testing actually pans out.
>
>>> it's enough of a challenge to get my 4-core dev board with spinning disk
>>> and gigabit ethernet to push anything into a depot at all :)
>>>
>>
>> I have to admit that I was hoping to also see a more aggressive
>> reclaim strategy, where we also trim the per-CPU rcaches when not in
>> use. Leizhen proposed something like this a long time ago.
>
> Don't think I haven't been having various elaborate ideas for making it
> cleverer with multiple thresholds and self-tuning, however I have
> managed to restrain myself ;)
>

OK, understood. My main issue WRT scalability is that the total
cacheable IOVAs (CPU and depot rcache) scales up with the number of
CPUs, but many DMA controllers have a fixed number of max in-flight
requests.

Consider a SCSI storage controller on a 256-CPU system. The in-flight
limit for this example controller is 4096, which would typically never
even be used up, or may not even be usable.

For this device, we need 4096 * 6 [IOVA rcache range] = ~24K cached
IOVAs if we were to pre-allocate them all - obviously I am ignoring that
we have the per-CPU rcache for speed and it would not make sense to
share one set. However, according to the current IOVA driver, we can in
theory cache up to ((256 [CPUs] * 2 [loaded + prev]) + 32 [depot size]) *
6 [rcache range] * 128 (IOVA per mag) = ~420K IOVAs. That's ~17x what we
would ever need.
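
Spelling that arithmetic out (a rough sketch; the real IOVA_MAG_SIZE is
127, with the round 128 above being an approximation):

/* Hypothetical helper, just illustrating the worst-case estimate */
static unsigned long rcache_capacity(unsigned int cpus, unsigned int depot_mags)
{
	/* loaded + prev magazine per CPU, plus the depot, per size bin */
	return (cpus * 2UL + depot_mags) * IOVA_RANGE_CACHE_MAX_SIZE * IOVA_MAG_SIZE;
}

/* rcache_capacity(256, 32) = (512 + 32) * 6 * 127 = 414528, i.e. ~420K */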

Something like NVMe is different, as its total requests can scale up
with the CPU count, but only to a limit. I am not sure about network
controllers.

Anyway, this is just something which I think should be considered -
which I guess already has been.

> At this point I'm just looking to confirm whether the fundamental
> concepts are sound, and at least no worse than the current behaviour
> (hence keeping it split into 2 distinct patches for the sake of review
> and debugging). If it proves solid then we can absolutely come back and
> go to town on enhancements later.

Thanks,
John

2023-08-20 17:04:26

by Jerry Snitselaar

Subject: Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible

On Mon, Aug 14, 2023 at 06:53:32PM +0100, Robin Murphy wrote:
> Hi all,
>
> Prompted by [1], which reminded me I started this a while ago, I've now
> finished off my own attempt at sorting out the horrid lack of rcache
> scalability. It's become quite clear that given the vast range of system
> sizes and workloads there is no right size for a fixed depot array, so I
> reckon we're better off not having one at all.
>
> Note that the reclaim threshold and rate are chosen fairly arbitrarily -
> it's enough of a challenge to get my 4-core dev board with spinning disk
> and gigabit ethernet to push anything into a depot at all :)
>
> Thanks,
> Robin.
>
> [1] https://lore.kernel.org/linux-iommu/[email protected]
>
>
> Robin Murphy (2):
> iommu/iova: Make the rcache depot scale better
> iommu/iova: Manage the depot list size
>
> drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
> 1 file changed, 65 insertions(+), 29 deletions(-)
>
> --
> 2.39.2.101.g768bb238c484.dirty
>

I'm trying to hunt down a system where we've seen some issues before,
but most of them have involved systems with nvme drives. Commit
3710e2b056cb ("nvme-pci: clamp max_hw_sectors based on DMA optimized
limitation") has helped those cases. I ran the patches overnight with
IOVA_DEPOT_DELAY fixed up on a couple of Genoa-based systems (384
cores) without issue.

Regards,
Jerry


2023-08-21 11:11:00

by Robin Murphy

Subject: Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better

On 2023-08-21 09:11, Srivastava, Dheeraj Kumar wrote:
> Hello Robin,
>
> On 8/14/2023 11:23 PM, Robin Murphy wrote:
>> The algorithm in the original paper specifies the storage of full
>> magazines in the depot as an unbounded list rather than a fixed-size
>> array. It turns out to be pretty straightforward to do this in our
>> implementation with no significant loss of efficiency. This allows
>> the depot to scale up to the working set sizes of larger systems,
>> while also potentially saving some memory on smaller ones too.
>>
>> Signed-off-by: Robin Murphy <[email protected]>
>> ---
>>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>>   1 file changed, 36 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index 10b964600948..d2de6fb0e9f4 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    * will be wasted.
>>    */
>>   #define IOVA_MAG_SIZE 127
>> -#define MAX_GLOBAL_MAGS 32    /* magazines per bin */
>>   struct iova_magazine {
>> -    unsigned long size;
>> +    /*
>> +     * Only full magazines are inserted into the depot, so we can avoid
>> +     * a separate list head and preserve maximum space-efficiency.
>> +     */
>> +    union {
>> +        unsigned long size;
>> +        struct iova_magazine *next;
>> +    };
>>       unsigned long pfns[IOVA_MAG_SIZE];
>>   };
>> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> -    unsigned long depot_size;
>> -    struct iova_magazine *depot[MAX_GLOBAL_MAGS];
>> +    struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>>   };
>> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct
>> iova_magazine *mag, unsigned long pfn)
>>       mag->pfns[mag->size++] = pfn;
>>   }
>> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>> +{
>> +    struct iova_magazine *mag = rcache->depot;
>> +
>> +    rcache->depot = mag->next;
>
> While doing a routine domain change test for a device ("unbind device
> from driver -> change domain of the device -> bind device back to the
> driver"), I ran into the following NULL pointer dereference.
>
> [  599.020261] BUG: kernel NULL pointer dereference, address:
> 0000000000000000
> [  599.020986] #PF: supervisor read access in kernel mode
> [  599.021588] #PF: error_code(0x0000) - not-present page
> [  599.022180] PGD 0 P4D 0
> [  599.022770] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  599.023365] CPU: 68 PID: 3122 Comm: avocado Not tainted
> 6.5.0-rc6-ChngDomainIssue #16
> [  599.023970] Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS
> 2.3.6 07/06/2021
> [  599.024571] RIP: 0010:free_iova_rcaches+0x9c/0x110
> [  599.025170] Code: d1 ff 39 05 36 d2 bc 01 48 89 c3 77 b4 49 8b 7f 10
> e8 b8 69 93 ff 49 8d 7f 20 e8 6f e4 6b ff eb 05 e8 48 ba 93 ff 49 8b 7f
> 08 <48> 8b 07 49 89 47 08 48 c7 07 7f 00 00 00 41 83 6f 04 01 48 85 ff
> [  599.026436] RSP: 0018:ffffb78b4c9f7c68 EFLAGS: 00010296
> [  599.027075] RAX: ffffffff9fa8c100 RBX: 0000000000000080 RCX:
> 0000000000000005
> [  599.027719] RDX: 0000000000000000 RSI: 000000007fffffff RDI:
> 0000000000000000
> [  599.028359] RBP: ffffb78b4c9f7c98 R08: 0000000000000000 R09:
> 0000000000000006
> [  599.028995] R10: 0000000000000000 R11: 0000000000000000 R12:
> 0000000000000000
> [  599.029636] R13: ffff93910d9c6008 R14: ffffd78b3bde1000 R15:
> ffff9391144ebc00
> [  599.030283] FS:  00007fa5c9e5c000(0000) GS:ffff93cf72700000(0000)
> knlGS:0000000000000000
> [  599.030941] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  599.031588] CR2: 0000000000000000 CR3: 000000013f526006 CR4:
> 0000000000770ee0
> [  599.032237] PKRU: 55555554
> [  599.032878] Call Trace:
> [  599.033512]  <TASK>
> [  599.034140]  ? show_regs+0x6e/0x80
> [  599.034769]  ? __die+0x29/0x70
> [  599.035393]  ? page_fault_oops+0x154/0x4a0
> [  599.036021]  ? __x86_return_thunk+0x9/0x10
> [  599.036647]  ? do_user_addr_fault+0x318/0x6b0
> [  599.037258]  ? __x86_return_thunk+0x9/0x10
> [  599.037866]  ? __slab_free+0xc7/0x320
> [  599.038472]  ? exc_page_fault+0x7d/0x190
> [  599.039074]  ? asm_exc_page_fault+0x2b/0x30
> [  599.039683]  ? free_iova_rcaches+0x9c/0x110
> [  599.040286]  ? free_iova_rcaches+0x91/0x110
> [  599.040875]  ? __x86_return_thunk+0x9/0x10
> [  599.041460]  put_iova_domain+0x32/0xa0
> [  599.042041]  iommu_put_dma_cookie+0x177/0x1b0
> [  599.042620]  iommu_domain_free+0x1f/0x50
> [  599.043194]  iommu_setup_default_domain+0x2fb/0x420
> [  599.043774]  iommu_group_store_type+0xb6/0x210
> [  599.044362]  iommu_group_attr_store+0x21/0x40
> [  599.044938]  sysfs_kf_write+0x42/0x50
> [  599.045511]  kernfs_fop_write_iter+0x143/0x1d0
> [  599.046084]  vfs_write+0x2c2/0x3f0
> [  599.046653]  ksys_write+0x6b/0xf0
> [  599.047219]  __x64_sys_write+0x1d/0x30
> [  599.047782]  do_syscall_64+0x60/0x90
> [  599.048346]  ? syscall_exit_to_user_mode+0x2a/0x50
> [  599.048911]  ? __x64_sys_lseek+0x1c/0x30
> [  599.049465]  ? __x86_return_thunk+0x9/0x10
> [  599.050013]  ? do_syscall_64+0x6d/0x90
> [  599.050562]  ? do_syscall_64+0x6d/0x90
> [  599.051098]  ? do_syscall_64+0x6d/0x90
> [  599.051625]  ? __x86_return_thunk+0x9/0x10
> [  599.052149]  ? exc_page_fault+0x8e/0x190
> [  599.052665]  entry_SYSCALL_64_after_hwframe+0x73/0xdd
> [  599.053180] RIP: 0033:0x7fa5c9d14a6f
> [  599.053670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 19 c0 f7
> ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f
> 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 5c c0 f7 ff 48
> [  599.054672] RSP: 002b:00007ffdeb05f540 EFLAGS: 00000293 ORIG_RAX:
> 0000000000000001
> [  599.055182] RAX: ffffffffffffffda RBX: 0000557de3275a80 RCX:
> 00007fa5c9d14a6f
> [  599.055692] RDX: 0000000000000003 RSI: 0000557de418ff60 RDI:
> 0000000000000011
> [  599.056203] RBP: 0000557de39a25e0 R08: 0000000000000000 R09:
> 0000000000000000
> [  599.056718] R10: 0000000000000000 R11: 0000000000000293 R12:
> 0000000000000003
> [  599.057227] R13: 00007fa5c9e5bf80 R14: 0000000000000011 R15:
> 0000557de418ff60
> [  599.057738]  </TASK>
> [  599.058227] Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack
> ipt_REJECT nf_reject_ipv4 xt_tcpudp nft_compat nft_chain_nat nf_nat
> nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge
> stp llc ipmi_ssif binfmt_misc nls_iso8859_1 intel_rapl_msr
> intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm dell_smbios dcdbas
> rapl dell_wmi_descriptor wmi_bmof joydev input_leds ccp ptdma k10temp
> acpi_ipmi ipmi_si acpi_power_meter mac_hid sch_fq_codel dm_multipath
> scsi_dh_rdac ipmi_devintf scsi_dh_emc ipmi_msghandler scsi_dh_alua msr
> ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables
> x_tables autofs4 hid_generic usbhid hid btrfs blake2b_generic raid10
> raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor
> raid6_pq libcrc32c raid1 raid0 multipath linear mgag200 i2c_algo_bit
> drm_shmem_helper drm_kms_helper crct10dif_pclmul crc32_pclmul
> ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd nvme drm
> mpt3sas tg3 ahci nvme_core libahci xhci_pci raid_class i2c_piix4
> [  599.058423]  xhci_pci_renesas scsi_transport_sas wmi
> [  599.064210] CR2: 0000000000000000
> [  599.064841] ---[ end trace 0000000000000000 ]---
> --
>
> Looking at the RIP (free_iova_rcaches+0x9c/0x110) pointed me to the
> above line, leading me to believe we are popping an element from an
> empty stack. The following hunk fixed the issue for me:

Oh dear... looks like this was a brainfart when I factored out the
push/pop helpers to replace the original list_head-based prototype.
Thanks for the catch!

This fix is functionally fine, but I think what I'll do for v2 is
change those "while ((mag = iova_depot_pop(rcache)))" loops, since
assignment-in-the-loop-condition logic tends to look a bit suspect
anyway (see the sketch below). Other than that, though, were you able
to notice any difference (good or bad) in CPU load or memory
consumption overall?
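
For free_iova_rcaches() the loop would then become something like this
(just a sketch of the direction, not the actual v2):

	while (rcache->depot)
		iova_magazine_free(iova_depot_pop(rcache));

which makes the emptiness check explicit, so iova_depot_pop() is never
called on an empty depot.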

Thanks,
Robin.

>
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 76a7d694708e..899f1c2ba62a 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -732,6 +732,9 @@ static struct iova_magazine *iova_depot_pop(struct
> iova_rcache *rcache)
>  {
>      struct iova_magazine *mag = rcache->depot;
>
> +    if (!mag)
> +        return NULL;
> +
>      rcache->depot = mag->next;
>      mag->size = IOVA_MAG_SIZE;
>      rcache->depot_size--;
> --
>
>> +    mag->size = IOVA_MAG_SIZE;
>> +    return mag;
>> +}
>> +
>
> --
> Thanks and Regards
> Dheeraj Kumar Srivastava
>

2023-08-21 13:07:49

by Robin Murphy

Subject: Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better

On 2023-08-21 13:02, John Garry wrote:
> On 14/08/2023 18:53, Robin Murphy wrote:
>> The algorithm in the original paper specifies the storage of full
>> magazines in the depot as an unbounded list rather than a fixed-size
>> array. It turns out to be pretty straightforward to do this in our
>> implementation with no significant loss of efficiency. This allows
>> the depot to scale up to the working set sizes of larger systems,
>> while also potentially saving some memory on smaller ones too.
>>
>> Signed-off-by: Robin Murphy <[email protected]>
>
> This looks ok (ignoring the crash reported), so feel free to add:
>
> Reviewed-by: John Garry <[email protected]>

Thanks!

> A small comment and question below.
>
>> ---
>>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>>   1 file changed, 36 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index 10b964600948..d2de6fb0e9f4 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    * will be wasted.
>>    */
>>   #define IOVA_MAG_SIZE 127
>> -#define MAX_GLOBAL_MAGS 32    /* magazines per bin */
>>   struct iova_magazine {
>> -    unsigned long size;
>> +    /*
>> +     * Only full magazines are inserted into the depot, so we can avoid
>> +     * a separate list head and preserve maximum space-efficiency.
>
> It might be worth explicitly mentioning that we try to keep the total
> mag size a power-of-2

Sure, I can tie it in with the existing comment above, which might
actually end up more readable anyway.
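
For the record, the invariant is that one word of metadata plus 127
pfns keeps the whole magazine at 128 words, i.e. a power-of-2 total
size; a hypothetical compile-time check (not part of this patch) could
even pin that down:

	static_assert(sizeof(struct iova_magazine) == 128 * sizeof(unsigned long));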

>> +     */
>> +    union {
>> +        unsigned long size;
>> +        struct iova_magazine *next;
>> +    };
>>       unsigned long pfns[IOVA_MAG_SIZE];
>>   };
>> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> -    unsigned long depot_size;
>> -    struct iova_magazine *depot[MAX_GLOBAL_MAGS];
>> +    struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>>   };
>> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct
>> iova_magazine *mag, unsigned long pfn)
>>       mag->pfns[mag->size++] = pfn;
>>   }
>> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>> +{
>> +    struct iova_magazine *mag = rcache->depot;
>> +
>> +    rcache->depot = mag->next;
>> +    mag->size = IOVA_MAG_SIZE;
>> +    return mag;
>> +}
>> +
>> +static void iova_depot_push(struct iova_rcache *rcache, struct
>> iova_magazine *mag)
>> +{
>> +    mag->next = rcache->depot;
>> +    rcache->depot = mag;
>> +}
>> +
>>   int iova_domain_init_rcaches(struct iova_domain *iovad)
>>   {
>>       unsigned int cpu;
>> @@ -734,7 +754,6 @@ int iova_domain_init_rcaches(struct iova_domain
>> *iovad)
>>           rcache = &iovad->rcaches[i];
>>           spin_lock_init(&rcache->lock);
>> -        rcache->depot_size = 0;
>>           rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>>                                cache_line_size());
>>           if (!rcache->cpu_rcaches) {
>> @@ -776,7 +795,6 @@ static bool __iova_rcache_insert(struct
>> iova_domain *iovad,
>>                    struct iova_rcache *rcache,
>>                    unsigned long iova_pfn)
>>   {
>> -    struct iova_magazine *mag_to_free = NULL;
>>       struct iova_cpu_rcache *cpu_rcache;
>>       bool can_insert = false;
>>       unsigned long flags;
>> @@ -794,12 +812,7 @@ static bool __iova_rcache_insert(struct
>> iova_domain *iovad,
>>           if (new_mag) {
>>               spin_lock(&rcache->lock);
>> -            if (rcache->depot_size < MAX_GLOBAL_MAGS) {
>> -                rcache->depot[rcache->depot_size++] =
>> -                        cpu_rcache->loaded;
>> -            } else {
>> -                mag_to_free = cpu_rcache->loaded;
>> -            }
>> +            iova_depot_push(rcache, cpu_rcache->loaded);
>>               spin_unlock(&rcache->lock);
>
> Out of curiosity, do you know why we take the approach (prior to this
> change) to free the loaded mag and alloc a new empty mag? Why not
> instead just say that we can't insert and bail out?

I have a feeling it must have been mentioned at some point, since my
memory says there was a deliberate intent to keep the flow through the
critical section simple and consistent, and minimise time spent holding
the rcache lock, and I'm 99% sure that isn't my own inferred reasoning...

Cheers,
Robin.

>>               cpu_rcache->loaded = new_mag;
>> @@ -812,11 +825,6 @@ static bool __iova_rcache_insert(struct
>> iova_domain *iovad,
>>       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>> -    if (mag_to_free) {
>> -        iova_magazine_free_pfns(mag_to_free, iovad);
>> -        iova_magazine_free(mag_to_free);
>> -    }
>> -
>>       return can_insert;
>>   }
>> @@ -854,9 +862,9 @@ static unsigned long __iova_rcache_get(struct
>> iova_rcache *rcache,
>>           has_pfn = true;
>>       } else {
>>           spin_lock(&rcache->lock);
>> -        if (rcache->depot_size > 0) {
>> +        if (rcache->depot) {
>>               iova_magazine_free(cpu_rcache->loaded);
>> -            cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
>> +            cpu_rcache->loaded = iova_depot_pop(rcache);
>>               has_pfn = true;
>>           }
>>           spin_unlock(&rcache->lock);
>> @@ -894,10 +902,10 @@ static void free_iova_rcaches(struct iova_domain
>> *iovad)
>>   {
>>       struct iova_rcache *rcache;
>>       struct iova_cpu_rcache *cpu_rcache;
>> +    struct iova_magazine *mag;
>>       unsigned int cpu;
>> -    int i, j;
>> -    for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>> +    for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>>           rcache = &iovad->rcaches[i];
>>           if (!rcache->cpu_rcaches)
>>               break;
>> @@ -907,8 +915,8 @@ static void free_iova_rcaches(struct iova_domain
>> *iovad)
>>               iova_magazine_free(cpu_rcache->prev);
>>           }
>>           free_percpu(rcache->cpu_rcaches);
>> -        for (j = 0; j < rcache->depot_size; ++j)
>> -            iova_magazine_free(rcache->depot[j]);
>> +        while ((mag = iova_depot_pop(rcache)))
>> +            iova_magazine_free(mag);
>>       }
>>       kfree(iovad->rcaches);
>> @@ -941,17 +949,16 @@ static void free_cpu_cached_iovas(unsigned int
>> cpu, struct iova_domain *iovad)
>>   static void free_global_cached_iovas(struct iova_domain *iovad)
>>   {
>>       struct iova_rcache *rcache;
>> +    struct iova_magazine *mag;
>>       unsigned long flags;
>> -    int i, j;
>> -    for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>> +    for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>>           rcache = &iovad->rcaches[i];
>>           spin_lock_irqsave(&rcache->lock, flags);
>> -        for (j = 0; j < rcache->depot_size; ++j) {
>> -            iova_magazine_free_pfns(rcache->depot[j], iovad);
>> -            iova_magazine_free(rcache->depot[j]);
>> +        while ((mag = iova_depot_pop(rcache))) {
>> +            iova_magazine_free_pfns(mag, iovad);
>> +            iova_magazine_free(mag);
>>           }
>> -        rcache->depot_size = 0;
>>           spin_unlock_irqrestore(&rcache->lock, flags);
>>       }
>>   }
>