v2: https://lore.kernel.org/linux-iommu/[email protected]/
Hi all,
I hope this is good to go now, just fixed the locking (and threw
lockdep at it to confirm, which of course I should have done to begin
with...) and picked up tags.
Cheers,
Robin.
Robin Murphy (2):
iommu/iova: Make the rcache depot scale better
iommu/iova: Manage the depot list size
drivers/iommu/iova.c | 95 ++++++++++++++++++++++++++++++--------------
1 file changed, 65 insertions(+), 30 deletions(-)
--
2.39.2.101.g768bb238c484.dirty
Automatically scaling the depot up to suit the peak capacity of a
workload is all well and good, but it would be nice to have a way to
scale it back down again if the workload changes. To that end, add
background reclaim that will gradually free surplus magazines if the
depot size remains above a reasonable threshold for long enough.
Reviewed-by: Jerry Snitselaar <[email protected]>
Signed-off-by: Robin Murphy <[email protected]>
---
v3: Make sure iova_depot_work_func() locking is IRQ-safe
drivers/iommu/iova.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index dd2309e9a6c5..d30e453d0fb4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/bitops.h>
#include <linux/cpu.h>
+#include <linux/workqueue.h>
/* The anchor node sits above the top of the usable address space */
#define IOVA_ANCHOR ~0UL
@@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
*/
#define IOVA_MAG_SIZE 127
+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
+
struct iova_magazine {
union {
unsigned long size;
@@ -644,8 +647,11 @@ struct iova_cpu_rcache {
struct iova_rcache {
spinlock_t lock;
+ unsigned int depot_size;
struct iova_magazine *depot;
struct iova_cpu_rcache __percpu *cpu_rcaches;
+ struct iova_domain *iovad;
+ struct delayed_work work;
};
static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
@@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
+ rcache->depot_size--;
return mag;
}
@@ -733,6 +740,25 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
{
mag->next = rcache->depot;
rcache->depot = mag;
+ rcache->depot_size++;
+}
+
+static void iova_depot_work_func(struct work_struct *work)
+{
+ struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
+ struct iova_magazine *mag = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rcache->lock, flags);
+ if (rcache->depot_size > num_online_cpus())
+ mag = iova_depot_pop(rcache);
+ spin_unlock_irqrestore(&rcache->lock, flags);
+
+ if (mag) {
+ iova_magazine_free_pfns(mag, rcache->iovad);
+ iova_magazine_free(mag);
+ schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
+ }
}
int iova_domain_init_rcaches(struct iova_domain *iovad)
@@ -752,6 +778,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
rcache = &iovad->rcaches[i];
spin_lock_init(&rcache->lock);
+ rcache->iovad = iovad;
+ INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
cache_line_size());
if (!rcache->cpu_rcaches) {
@@ -812,6 +840,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
spin_lock(&rcache->lock);
iova_depot_push(rcache, cpu_rcache->loaded);
spin_unlock(&rcache->lock);
+ schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
cpu_rcache->loaded = new_mag;
can_insert = true;
@@ -912,6 +941,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
iova_magazine_free(cpu_rcache->prev);
}
free_percpu(rcache->cpu_rcaches);
+ cancel_delayed_work_sync(&rcache->work);
while (rcache->depot)
iova_magazine_free(iova_depot_pop(rcache));
}
--
2.39.2.101.g768bb238c484.dirty
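For illustration only, a rough user-space sketch of the reclaim policy described above. The depot_push()/depot_pop()/depot_reclaim_once() names, the struct layouts and the fixed threshold are invented for the sketch; the real code above uses the rcache lock, a delayed work item rescheduled with IOVA_DEPOT_DELAY, and num_online_cpus() as the threshold.

#include <stdio.h>
#include <stdlib.h>

struct magazine {
	struct magazine *next;
};

struct depot {
	struct magazine *head;
	unsigned int size;
	unsigned int threshold;		/* stands in for num_online_cpus() */
};

static void depot_push(struct depot *d, struct magazine *mag)
{
	mag->next = d->head;
	d->head = mag;
	d->size++;
}

static struct magazine *depot_pop(struct depot *d)
{
	struct magazine *mag = d->head;

	d->head = mag->next;
	d->size--;
	return mag;
}

/* One "work" invocation: free at most one surplus magazine. */
static int depot_reclaim_once(struct depot *d)
{
	if (d->size <= d->threshold)
		return 0;
	free(depot_pop(d));
	return 1;	/* surplus may remain, so the caller "reschedules" */
}

int main(void)
{
	struct depot d = { .head = NULL, .size = 0, .threshold = 4 };

	for (int i = 0; i < 10; i++)
		depot_push(&d, calloc(1, sizeof(struct magazine)));

	while (depot_reclaim_once(&d))
		;	/* the real code reschedules the delayed work instead */

	printf("depot size after reclaim: %u\n", d.size);
	while (d.head)
		free(depot_pop(&d));
	return 0;
}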
The algorithm in the original paper specifies the storage of full
magazines in the depot as an unbounded list rather than a fixed-size
array. It turns out to be pretty straightforward to do this in our
implementation with no significant loss of efficiency. This allows
the depot to scale up to the working set sizes of larger systems,
while also potentially saving some memory on smaller ones too.
Since this involves touching struct iova_magazine with the requisite
care, we may as well reinforce the comment with a proper assertion too.
Reviewed-by: John Garry <[email protected]>
Reviewed-by: Jerry Snitselaar <[email protected]>
Signed-off-by: Robin Murphy <[email protected]>
---
v3: No change
drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
1 file changed, 35 insertions(+), 30 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 10b964600948..dd2309e9a6c5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -622,15 +622,19 @@ EXPORT_SYMBOL_GPL(reserve_iova);
/*
* As kmalloc's buffer size is fixed to power of 2, 127 is chosen to
* assure size of 'iova_magazine' to be 1024 bytes, so that no memory
- * will be wasted.
+ * will be wasted. Since only full magazines are inserted into the depot,
+ * we don't need to waste PFN capacity on a separate list head either.
*/
#define IOVA_MAG_SIZE 127
-#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
struct iova_magazine {
- unsigned long size;
+ union {
+ unsigned long size;
+ struct iova_magazine *next;
+ };
unsigned long pfns[IOVA_MAG_SIZE];
};
+static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1)));
struct iova_cpu_rcache {
spinlock_t lock;
@@ -640,8 +644,7 @@ struct iova_cpu_rcache {
struct iova_rcache {
spinlock_t lock;
- unsigned long depot_size;
- struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+ struct iova_magazine *depot;
struct iova_cpu_rcache __percpu *cpu_rcaches;
};
@@ -717,6 +720,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
mag->pfns[mag->size++] = pfn;
}
+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
+{
+ struct iova_magazine *mag = rcache->depot;
+
+ rcache->depot = mag->next;
+ mag->size = IOVA_MAG_SIZE;
+ return mag;
+}
+
+static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
+{
+ mag->next = rcache->depot;
+ rcache->depot = mag;
+}
+
int iova_domain_init_rcaches(struct iova_domain *iovad)
{
unsigned int cpu;
@@ -734,7 +752,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
rcache = &iovad->rcaches[i];
spin_lock_init(&rcache->lock);
- rcache->depot_size = 0;
rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
cache_line_size());
if (!rcache->cpu_rcaches) {
@@ -776,7 +793,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
struct iova_rcache *rcache,
unsigned long iova_pfn)
{
- struct iova_magazine *mag_to_free = NULL;
struct iova_cpu_rcache *cpu_rcache;
bool can_insert = false;
unsigned long flags;
@@ -794,12 +810,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
if (new_mag) {
spin_lock(&rcache->lock);
- if (rcache->depot_size < MAX_GLOBAL_MAGS) {
- rcache->depot[rcache->depot_size++] =
- cpu_rcache->loaded;
- } else {
- mag_to_free = cpu_rcache->loaded;
- }
+ iova_depot_push(rcache, cpu_rcache->loaded);
spin_unlock(&rcache->lock);
cpu_rcache->loaded = new_mag;
@@ -812,11 +823,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
spin_unlock_irqrestore(&cpu_rcache->lock, flags);
- if (mag_to_free) {
- iova_magazine_free_pfns(mag_to_free, iovad);
- iova_magazine_free(mag_to_free);
- }
-
return can_insert;
}
@@ -854,9 +860,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
has_pfn = true;
} else {
spin_lock(&rcache->lock);
- if (rcache->depot_size > 0) {
+ if (rcache->depot) {
iova_magazine_free(cpu_rcache->loaded);
- cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+ cpu_rcache->loaded = iova_depot_pop(rcache);
has_pfn = true;
}
spin_unlock(&rcache->lock);
@@ -895,9 +901,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
struct iova_rcache *rcache;
struct iova_cpu_rcache *cpu_rcache;
unsigned int cpu;
- int i, j;
- for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
rcache = &iovad->rcaches[i];
if (!rcache->cpu_rcaches)
break;
@@ -907,8 +912,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
iova_magazine_free(cpu_rcache->prev);
}
free_percpu(rcache->cpu_rcaches);
- for (j = 0; j < rcache->depot_size; ++j)
- iova_magazine_free(rcache->depot[j]);
+ while (rcache->depot)
+ iova_magazine_free(iova_depot_pop(rcache));
}
kfree(iovad->rcaches);
@@ -942,16 +947,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad)
{
struct iova_rcache *rcache;
unsigned long flags;
- int i, j;
- for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+ for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
rcache = &iovad->rcaches[i];
spin_lock_irqsave(&rcache->lock, flags);
- for (j = 0; j < rcache->depot_size; ++j) {
- iova_magazine_free_pfns(rcache->depot[j], iovad);
- iova_magazine_free(rcache->depot[j]);
+ while (rcache->depot) {
+ struct iova_magazine *mag = iova_depot_pop(rcache);
+
+ iova_magazine_free_pfns(mag, iovad);
+ iova_magazine_free(mag);
}
- rcache->depot_size = 0;
spin_unlock_irqrestore(&rcache->lock, flags);
}
}
--
2.39.2.101.g768bb238c484.dirty
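As a standalone illustration of the layout and assertion described above (not the kernel code; it assumes an LP64 target where unsigned long is 8 bytes, and the main() demonstration is purely made up): the first word of a magazine holds either its element count or, while the magazine sits on the depot list, the next pointer.

#include <assert.h>
#include <stdio.h>

#define IOVA_MAG_SIZE 127

struct iova_magazine {
	union {
		unsigned long size;		/* used while the magazine is loaded/previous */
		struct iova_magazine *next;	/* used while it sits on the depot list */
	};
	unsigned long pfns[IOVA_MAG_SIZE];
};

/* 8 + 127 * 8 = 1024 bytes on LP64, i.e. exactly one kmalloc-1024 object */
static_assert(sizeof(struct iova_magazine) == 1024, "unexpected magazine size");

int main(void)
{
	struct iova_magazine a = { .size = IOVA_MAG_SIZE };
	struct iova_magazine b = { .size = 0 };

	b.next = &a;		/* "pushed" to the depot: the first word is a pointer */
	printf("on depot: next = %p\n", (void *)b.next);
	b.size = IOVA_MAG_SIZE;	/* "popped" again: the same word becomes the count */
	printf("popped:   size = %lu\n", b.size);
	return 0;
}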
On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> Robin Murphy (2):
> iommu/iova: Make the rcache depot scale better
> iommu/iova: Manage the depot list size
Applied, thanks.
On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> v2: https://lore.kernel.org/linux-iommu/[email protected]/
>
> Hi all,
>
> I hope this is good to go now, just fixed the locking (and threw
> lockdep at it to confirm, which of course I should have done to begin
> with...) and picked up tags.
Hi,
After pulling the v6.7 changes we started seeing the following memory
leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
which is why I didn't perform bisection. However, looking at the
mentioned code paths, they seem to have been changed in v6.7 as part of
this patchset. I reverted both patches and didn't see any memory leaks
when running a full regression (~10 hours), but I will repeat it to be
sure.
Any idea what could be the problem?
Thanks
[1]
unreferenced object 0xffff8881a5301000 (size 1024):
comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
backtrace:
[<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
[<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
[<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea16b>] __run_timers+0x78b/0xb80
[<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff8881392ec000 (size 1024):
comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
hex dump (first 32 bytes):
00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
backtrace:
[<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
[<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
[<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea16b>] __run_timers+0x78b/0xb80
[<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff8881411f9000 (size 1024):
comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
hex dump (first 32 bytes):
00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
backtrace:
[<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
[<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
[<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea16b>] __run_timers+0x78b/0xb80
[<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff88812be26400 (size 1024):
comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
hex dump (first 32 bytes):
00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
backtrace:
[<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
[<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
[<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea16b>] __run_timers+0x78b/0xb80
[<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff8881261c1000 (size 1024):
comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
hex dump (first 32 bytes):
00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
backtrace:
[<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
[<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
[<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea16b>] __run_timers+0x78b/0xb80
[<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> > v2: https://lore.kernel.org/linux-iommu/[email protected]/
> >
> > Hi all,
> >
> > I hope this is good to go now, just fixed the locking (and threw
> > lockdep at it to confirm, which of course I should have done to begin
> > with...) and picked up tags.
>
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
FYI, we didn't see the leaks since reverting these two patches, whereas
before we saw them almost every day, so I'm quite sure they introduced
the leaks.
Thanks
On Tue, Jan 02, 2024 at 09:24:37AM +0200, Ido Schimmel wrote:
> FYI, we didn't see the leaks since reverting these two patches whereas
> before we saw them almost everyday, so I'm quite sure they introduced
> the leaks.
Robin, can you have a look please?
Thanks,
Joerg
On 12/28/2023 8:23 PM, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/[email protected]/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
>
> Any idea what could be the problem?
>
> Thanks
>
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):
Size 1024 - it seems an "iova_magazine" was leaked?
No other struct here happens to be of size 1024.
Thanks,
Ethan
> comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
> hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
> 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881392ec000 (size 1024):
> comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
> hex dump (first 32 bytes):
> 00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
> f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881411f9000 (size 1024):
> comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
> hex dump (first 32 bytes):
> 00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
> ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff88812be26400 (size 1024):
> comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
> hex dump (first 32 bytes):
> 00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
> e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881261c1000 (size 1024):
> comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
> hex dump (first 32 bytes):
> 00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
> 87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
On 1/2/2024 3:24 PM, Ido Schimmel wrote:
> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>> v2: https://lore.kernel.org/linux-iommu/[email protected]/
>>>
>>> Hi all,
>>>
>>> I hope this is good to go now, just fixed the locking (and threw
>>> lockdep at it to confirm, which of course I should have done to begin
>>> with...) and picked up tags.
>> Hi,
>>
>> After pulling the v6.7 changes we started seeing the following memory
>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>> which is why I didn't perform bisection. However, looking at the
>> mentioned code paths, they seem to have been changed in v6.7 as part of
>> this patchset. I reverted both patches and didn't see any memory leaks
>> when running a full regression (~10 hours), but I will repeat it to be
>> sure.
> FYI, we didn't see the leaks since reverting these two patches whereas
> before we saw them almost everyday, so I'm quite sure they introduced
> the leaks.
Seems some magazines were not freed when a CPU went dead (was hot-unplugged)?
static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
{
	struct iova_cpu_rcache *cpu_rcache;
	struct iova_rcache *rcache;
	unsigned long flags;
	int i;

	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
		rcache = &iovad->rcaches[i];
		cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
		spin_lock_irqsave(&cpu_rcache->lock, flags);
		iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+		iova_magazine_free(cpu_rcache->loaded);
		iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+		iova_magazine_free(cpu_rcache->prev);
		spin_unlock_irqrestore(&cpu_rcache->lock, flags);
	}
}
Thanks,
Ethan
> Thanks
>
On 2024/1/6 12:21, Ethan Zhao wrote:
>
> On 1/2/2024 3:24 PM, Ido Schimmel wrote:
>> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>> v2:
>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>
>>>> Hi all,
>>>>
>>>> I hope this is good to go now, just fixed the locking (and threw
>>>> lockdep at it to confirm, which of course I should have done to begin
>>>> with...) and picked up tags.
>>> Hi,
>>>
>>> After pulling the v6.7 changes we started seeing the following memory
>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>> which is why I didn't perform bisection. However, looking at the
>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>> this patchset. I reverted both patches and didn't see any memory leaks
>>> when running a full regression (~10 hours), but I will repeat it to be
>>> sure.
>> FYI, we didn't see the leaks since reverting these two patches whereas
>> before we saw them almost everyday, so I'm quite sure they introduced
>> the leaks.
>
> Seems some magazines were not freed when one CPU is dead (hot
> unplugged) ?
>
> static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain
> *iovad)
> {
> struct iova_cpu_rcache *cpu_rcache;
> struct iova_rcache *rcache;
> unsigned long flags;
> int i;
>
> for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
> rcache = &iovad->rcaches[i];
> cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
> spin_lock_irqsave(&cpu_rcache->lock, flags);
> iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
>
> + iova_magazine_free(cpu_rcache->loaded);
>
> iova_magazine_free_pfns(cpu_rcache->prev, iovad);
>
> + iova_magazine_free(cpu_rcache->prev);
>
> spin_unlock_irqrestore(&cpu_rcache->lock, flags);
> }
> }
It seems cpu_rcache->loaded and cpu_rcache->prev will be freed in
free_iova_rcaches(), and it should not cause a memory leak because
iova_magazine_free() will be called there for each possible CPU.
free_cpu_cached_iovas() is only used to free the cached IOVAs held in the magazines.
Thanks,
Zekun
On 1/6/2024 3:07 PM, zhangzekun (A) wrote:
>
>
> On 2024/1/6 12:21, Ethan Zhao wrote:
>>
>> On 1/2/2024 3:24 PM, Ido Schimmel wrote:
>>> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>> v2:
>>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>>
>>>>> Hi all,
>>>>>
>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>> lockdep at it to confirm, which of course I should have done to begin
>>>>> with...) and picked up tags.
>>>> Hi,
>>>>
>>>> After pulling the v6.7 changes we started seeing the following memory
>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>>> which is why I didn't perform bisection. However, looking at the
>>>> mentioned code paths, they seem to have been changed in v6.7 as
>>>> part of
>>>> this patchset. I reverted both patches and didn't see any memory leaks
>>>> when running a full regression (~10 hours), but I will repeat it to be
>>>> sure.
>>> FYI, we didn't see the leaks since reverting these two patches whereas
>>> before we saw them almost everyday, so I'm quite sure they introduced
>>> the leaks.
>>
>> Seems some magazines were not freed when one CPU is dead (hot
>> unplugged) ?
>>
>> static void free_cpu_cached_iovas(unsigned int cpu, struct
>> iova_domain *iovad)
>> {
>> struct iova_cpu_rcache *cpu_rcache;
>> struct iova_rcache *rcache;
>> unsigned long flags;
>> int i;
>>
>> for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>> rcache = &iovad->rcaches[i];
>> cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
>> spin_lock_irqsave(&cpu_rcache->lock, flags);
>> iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
>>
>> + iova_magazine_free(cpu_rcache->loaded);
>>
>> iova_magazine_free_pfns(cpu_rcache->prev, iovad);
>>
>> + iova_magazine_free(cpu_rcache->prev);
>>
>> spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>> }
>> }
> It seems cpu_rcache->loaded and cpu_rcache->prev will be freed in
> free_iova_rcaches(), and it should not cause memory leak because
> iova_magazine_free() will be called for each possible cpu.
> free_cpu_cached_iovas() is used to free cached iovas in magazines.
Yup, looked closely - it is for each possible CPU, not just each online CPU.
Thanks,
Ethan
>
> Thanks,
> Zekun
>
>
>
On 12/28/2023 8:23 PM, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/[email protected]/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
>
> Any idea what could be the problem?
>
> Thanks
>
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):
> comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
> hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
Here the magazine is an empty "loaded" one - its size is 0.
> 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881392ec000 (size 1024):
> comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
> hex dump (first 32 bytes):
> 00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
The above *unreferenced object 0xffff8881a5301000* is referenced here,
00 10 30 a5 81 88 ff ff -> ff ff 88 81 a5 30 10 00
> f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881411f9000 (size 1024):
> comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
> hex dump (first 32 bytes):
> 00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
> ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff88812be26400 (size 1024):
> comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
> hex dump (first 32 bytes):
> 00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
> e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881261c1000 (size 1024):
> comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
> hex dump (first 32 bytes):
> 00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
The value of the first 8 bytes, read little-endian, is 0xffff88812be26400,
which is the address of the object 0xffff88812be26400 above, so the struct
here is exactly
struct iova_magazine {
	union {
		unsigned long size;
		struct iova_magazine *next;
	};
	unsigned long pfns[IOVA_MAG_SIZE];
};
When the magazine is stored in the depot, the *next member is used to point
to the next magazine, but when the magazine is loaded, the size member is
used instead.
So this report should be a *false positive* from kmemleak.
Thanks,
Ethan
> 87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
On 2023-12-28 12:23 pm, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/[email protected]/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
>
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
>
> Any idea what could be the problem?
Hmm, we've got what looks to be a set of magazines forming a plausible
depot list (or at least the tail end of one):
ffff8881411f9000 -> ffff8881261c1000
ffff8881261c1000 -> ffff88812be26400
ffff88812be26400 -> ffff8188392ec000
ffff8188392ec000 -> ffff8881a5301000
ffff8881a5301000 -> NULL
which I guess has somehow become detached from its rcache->depot without
being freed properly? However I'm struggling to see any conceivable way
that could happen which wouldn't already be more severely broken in
other ways as well (i.e. either general memory corruption or someone
somehow still trying to use the IOVA domain while it's being torn down).
Out of curiosity, does reverting just patch #2 alone make a difference?
And is your workload doing anything "interesting" in relation to IOVA
domain lifetimes, like creating and destroying SR-IOV virtual functions,
changing IOMMU domain types via sysfs, or using that horrible vdpa
thing, or are you seeing this purely from regular driver DMA API usage?
Thanks,
Robin.
>
> Thanks
>
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):
> comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
> hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
> 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881392ec000 (size 1024):
> comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
> hex dump (first 32 bytes):
> 00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
> f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881411f9000 (size 1024):
> comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
> hex dump (first 32 bytes):
> 00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
> ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff88812be26400 (size 1024):
> comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
> hex dump (first 32 bytes):
> 00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
> e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881261c1000 (size 1024):
> comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
> hex dump (first 32 bytes):
> 00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
> 87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
> backtrace:
> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
On 1/9/2024 1:35 AM, Robin Murphy wrote:
> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>> v2:
>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>
>>> Hi all,
>>>
>>> I hope this is good to go now, just fixed the locking (and threw
>>> lockdep at it to confirm, which of course I should have done to begin
>>> with...) and picked up tags.
>>
>> Hi,
>>
>> After pulling the v6.7 changes we started seeing the following memory
>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>> which is why I didn't perform bisection. However, looking at the
>> mentioned code paths, they seem to have been changed in v6.7 as part of
>> this patchset. I reverted both patches and didn't see any memory leaks
>> when running a full regression (~10 hours), but I will repeat it to be
>> sure.
>>
>> Any idea what could be the problem?
>
> Hmm, we've got what looks to be a set of magazines forming a plausible
> depot list (or at least the tail end of one):
>
> ffff8881411f9000 -> ffff8881261c1000
>
> ffff8881261c1000 -> ffff88812be26400
>
> ffff88812be26400 -> ffff8188392ec000
>
> ffff8188392ec000 -> ffff8881a5301000
>
> ffff8881a5301000 -> NULL
>
> which I guess has somehow become detached from its rcache->depot
> without being freed properly? However I'm struggling to see any
> conceivable way that could happen which wouldn't already be more
> severely broken in other ways as well (i.e. either general memory
> corruption or someone somehow still trying to use the IOVA domain
> while it's being torn down).
>
> Out of curiosity, does reverting just patch #2 alone make a
> difference? And is your workload doing anything "interesting" in
> relation to IOVA domain lifetimes, like creating and destroying SR-IOV
> virtual functions, changing IOMMU domain types via sysfs, or using
> that horrible vdpa thing, or are you seeing this purely from regular
> driver DMA API usage?
There is no lock held in free_iova_rcaches() - is it possible that
free_iova_rcaches() races with the delayed iova_depot_work_func()?
I don't know why cancel_delayed_work_sync(&rcache->work) isn't called first
in free_iova_rcaches() to avoid the possible race.
Thanks,
Ethan
>
> Thanks,
> Robin.
>
>>
>> Thanks
>>
>> [1]
>> unreferenced object 0xffff8881a5301000 (size 1024):
>> comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>> hex dump (first 32 bytes):
>> 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
>> 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
>> backtrace:
>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881392ec000 (size 1024):
>> comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>> hex dump (first 32 bytes):
>> 00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
>> f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
>> backtrace:
>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881411f9000 (size 1024):
>> comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>> hex dump (first 32 bytes):
>> 00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
>> ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
>> backtrace:
>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff88812be26400 (size 1024):
>> comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>> hex dump (first 32 bytes):
>> 00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
>> e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
>> backtrace:
>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881261c1000 (size 1024):
>> comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>> hex dump (first 32 bytes):
>> 00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
>> 87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
>> backtrace:
>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>
> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>> v2:
>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>
>>>> Hi all,
>>>>
>>>> I hope this is good to go now, just fixed the locking (and threw
>>>> lockdep at it to confirm, which of course I should have done to begin
>>>> with...) and picked up tags.
>>>
>>> Hi,
>>>
>>> After pulling the v6.7 changes we started seeing the following memory
>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>> which is why I didn't perform bisection. However, looking at the
>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>> this patchset. I reverted both patches and didn't see any memory leaks
>>> when running a full regression (~10 hours), but I will repeat it to be
>>> sure.
>>>
>>> Any idea what could be the problem?
>>
>> Hmm, we've got what looks to be a set of magazines forming a
>> plausible depot list (or at least the tail end of one):
>>
>> ffff8881411f9000 -> ffff8881261c1000
>>
>> ffff8881261c1000 -> ffff88812be26400
>>
>> ffff88812be26400 -> ffff8188392ec000
>>
>> ffff8188392ec000 -> ffff8881a5301000
>>
>> ffff8881a5301000 -> NULL
>>
>> which I guess has somehow become detached from its rcache->depot
>> without being freed properly? However I'm struggling to see any
>> conceivable way that could happen which wouldn't already be more
>> severely broken in other ways as well (i.e. either general memory
>> corruption or someone somehow still trying to use the IOVA domain
>> while it's being torn down).
>>
>> Out of curiosity, does reverting just patch #2 alone make a
>> difference? And is your workload doing anything "interesting" in
>> relation to IOVA domain lifetimes, like creating and destroying
>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or
>> using that horrible vdpa thing, or are you seeing this purely from
>> regular driver DMA API usage?
>
> There no lock held when free_iova_rcaches(), is it possible
> free_iova_rcaches() race with the delayed cancel_delayed_work_sync() ?
>
> I don't know why not call cancel_delayed_work_sync(&rcache->work);
> first in free_iova_rcaches() to avoid possible race.
>
Between the following function pairs, is a race possible if they are called concurrently?
1. free_iova_rcaches() with iova_depot_work_func()
free_iova_rcaches() holds no lock, iova_depot_work_func() holds
rcache->lock.
2. iova_cpuhp_dead() with iova_depot_work_func()
iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
iova_depot_work_func() holds rcache->lock.
3. iova_cpuhp_dead() with free_iova_rcaches()
iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
free_iova_rcaches() holds no lock.
4. iova_cpuhp_dead() with free_global_cached_iovas()
iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock and
free_global_cached_iovas() holds rcache->lock.
.....
Thanks,
Ethan
>
> Thanks,
>
> Ethan
>
>>
>> Thanks,
>> Robin.
>>
>>>
>>> Thanks
>>>
>>> [1]
>>> unreferenced object 0xffff8881a5301000 (size 1024):
>>> comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>>> hex dump (first 32 bytes):
>>> 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
>>> 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
>>> backtrace:
>>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881392ec000 (size 1024):
>>> comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>>> hex dump (first 32 bytes):
>>> 00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
>>> f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
>>> backtrace:
>>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881411f9000 (size 1024):
>>> comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>>> hex dump (first 32 bytes):
>>> 00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
>>> ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
>>> backtrace:
>>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff88812be26400 (size 1024):
>>> comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>>> hex dump (first 32 bytes):
>>> 00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
>>> e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
>>> backtrace:
>>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881261c1000 (size 1024):
>>> comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>>> hex dump (first 32 bytes):
>>> 00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
>>> 87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
>>> backtrace:
>>> [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>> [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>> [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>> [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>> [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>> [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>> [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>> [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>> [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>
On 2024-01-09 6:23 am, Ethan Zhao wrote:
>
> On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>>
>> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>> v2:
>>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>>
>>>>> Hi all,
>>>>>
>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>> lockdep at it to confirm, which of course I should have done to begin
>>>>> with...) and picked up tags.
>>>>
>>>> Hi,
>>>>
>>>> After pulling the v6.7 changes we started seeing the following memory
>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>>> which is why I didn't perform bisection. However, looking at the
>>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>>> this patchset. I reverted both patches and didn't see any memory leaks
>>>> when running a full regression (~10 hours), but I will repeat it to be
>>>> sure.
>>>>
>>>> Any idea what could be the problem?
>>>
>>> Hmm, we've got what looks to be a set of magazines forming a
>>> plausible depot list (or at least the tail end of one):
>>>
>>> ffff8881411f9000 -> ffff8881261c1000
>>>
>>> ffff8881261c1000 -> ffff88812be26400
>>>
>>> ffff88812be26400 -> ffff8188392ec000
>>>
>>> ffff8188392ec000 -> ffff8881a5301000
>>>
>>> ffff8881a5301000 -> NULL
>>>
>>> which I guess has somehow become detached from its rcache->depot
>>> without being freed properly? However I'm struggling to see any
>>> conceivable way that could happen which wouldn't already be more
>>> severely broken in other ways as well (i.e. either general memory
>>> corruption or someone somehow still trying to use the IOVA domain
>>> while it's being torn down).
>>>
>>> Out of curiosity, does reverting just patch #2 alone make a
>>> difference? And is your workload doing anything "interesting" in
>>> relation to IOVA domain lifetimes, like creating and destroying
>>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or
>>> using that horrible vdpa thing, or are you seeing this purely from
>>> regular driver DMA API usage?
>>
>> There no lock held when free_iova_rcaches(), is it possible
>> free_iova_rcaches() race with the delayed cancel_delayed_work_sync() ?
>>
>> I don't know why not call cancel_delayed_work_sync(&rcache->work);
>> first in free_iova_rcaches() to avoid possible race.
>>
> between following functions pair, race possible ? if called cocurrently.
>
> 1. free_iova_rcaches() with iova_depot_work_func()
>
> free_iova_rcaches() holds no lock, iova_depot_work_func() holds
> rcache->lock.
Unless I've completely misunderstood the workqueue API, that can't
happen, since free_iova_rcaches() *does* synchronously cancel the work
before it starts freeing the depot list.
> 2. iova_cpuhp_dead() with iova_depot_work_func()
>
> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
> iova_depot_work_func() holds rcache->lock.
That's not a race because those are touching completely different things
- the closest they come to interacting is where they both free IOVAs
back to the rbtree.
> 3. iova_cpuhp_dead() with free_iova_rcaches()
>
> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
> free_iova_rcaches() holds no lock.
See iova_domain_free_rcaches() - by the time we call
free_iova_rcaches(), the hotplug handler has already been removed (and
either way it couldn't account for *this* issue since it doesn't touch
the depot at all).
> 4. iova_cpuhp_dead() with free_global_cached_iovas()
>
> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock and
> free_global_cached_iovas() holds rcache->lock.
Again, they hold different locks because they're touching unrelated things.
Thanks,
Robin.
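To spell out why (1) above cannot corrupt or leak the depot list, a minimal user-space analogy (pthreads and made-up names standing in for the kernel workqueue API, not the actual code): the teardown path stops the reclaim worker and waits for it to finish before walking and freeing the list, so the unlocked free never runs concurrently with the locked reclaim.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct magazine { struct magazine *next; };

static pthread_mutex_t depot_lock = PTHREAD_MUTEX_INITIALIZER;
static struct magazine *depot;
static atomic_bool stop_reclaim;

/* Stand-in for the reclaim work function: pop and free under the depot lock. */
static void *reclaim_thread(void *arg)
{
	(void)arg;
	while (!atomic_load(&stop_reclaim)) {
		pthread_mutex_lock(&depot_lock);
		if (depot) {
			struct magazine *mag = depot;
			depot = mag->next;
			free(mag);
		}
		pthread_mutex_unlock(&depot_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t worker;

	for (int i = 0; i < 8; i++) {
		struct magazine *mag = calloc(1, sizeof(*mag));
		mag->next = depot;
		depot = mag;
	}
	pthread_create(&worker, NULL, reclaim_thread, NULL);

	/* "cancel_delayed_work_sync()": stop the worker and wait for it... */
	atomic_store(&stop_reclaim, true);
	pthread_join(worker, NULL);

	/* ...and only then walk and free the remaining list, with no lock needed. */
	while (depot) {
		struct magazine *mag = depot;
		depot = mag->next;
		free(mag);
	}
	return 0;
}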
Hi Robin,
Thanks for the reply.
On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> Hmm, we've got what looks to be a set of magazines forming a plausible depot
> list (or at least the tail end of one):
>
> ffff8881411f9000 -> ffff8881261c1000
>
> ffff8881261c1000 -> ffff88812be26400
>
> ffff88812be26400 -> ffff8188392ec000
>
> ffff8188392ec000 -> ffff8881a5301000
>
> ffff8881a5301000 -> NULL
>
> which I guess has somehow become detached from its rcache->depot without
> being freed properly? However I'm struggling to see any conceivable way that
> could happen which wouldn't already be more severely broken in other ways as
> well (i.e. either general memory corruption or someone somehow still trying
> to use the IOVA domain while it's being torn down).
The machine is running a debug kernel that among other things has KASAN
enabled, but there are no traces in the kernel log so there is no memory
corruption that I'm aware of.
> Out of curiosity, does reverting just patch #2 alone make a difference?
Will try and let you know.
> And is your workload doing anything "interesting" in relation to IOVA
> domain lifetimes, like creating and destroying SR-IOV virtual
> functions, changing IOMMU domain types via sysfs, or using that
> horrible vdpa thing, or are you seeing this purely from regular driver
> DMA API usage?
The machine is running networking related tests, but it is not using
SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
as IOMMU is concerned.
The two networking drivers on the machine are "igb" for the management
port and "mlxsw" for the data ports (the machine is a physical switch).
I believe the DMA API usage in the latter is quite basic and I don't
recall any DMA related problems with this driver since it was first
accepted upstream in 2015.
Thanks
On 1/9/2024 7:26 PM, Robin Murphy wrote:
> On 2024-01-09 6:23 am, Ethan Zhao wrote:
>>
>> On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>>>
>>> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>>>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>>> v2:
>>>>>> https://lore.kernel.org/linux-iommu/[email protected]/
>>>>>>
>>>>>> Hi all,
>>>>>>
>>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>>> lockdep at it to confirm, which of course I should have done to
>>>>>> begin
>>>>>> with...) and picked up tags.
>>>>>
>>>>> Hi,
>>>>>
>>>>> After pulling the v6.7 changes we started seeing the following memory
>>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce
>>>>> it,
>>>>> which is why I didn't perform bisection. However, looking at the
>>>>> mentioned code paths, they seem to have been changed in v6.7 as
>>>>> part of
>>>>> this patchset. I reverted both patches and didn't see any memory
>>>>> leaks
>>>>> when running a full regression (~10 hours), but I will repeat it
>>>>> to be
>>>>> sure.
>>>>>
>>>>> Any idea what could be the problem?
>>>>
>>>> Hmm, we've got what looks to be a set of magazines forming a
>>>> plausible depot list (or at least the tail end of one):
>>>>
>>>> ffff8881411f9000 -> ffff8881261c1000
>>>>
>>>> ffff8881261c1000 -> ffff88812be26400
>>>>
>>>> ffff88812be26400 -> ffff8188392ec000
>>>>
>>>> ffff8188392ec000 -> ffff8881a5301000
>>>>
>>>> ffff8881a5301000 -> NULL
>>>>
>>>> which I guess has somehow become detached from its rcache->depot
>>>> without being freed properly? However I'm struggling to see any
>>>> conceivable way that could happen which wouldn't already be more
>>>> severely broken in other ways as well (i.e. either general memory
>>>> corruption or someone somehow still trying to use the IOVA domain
>>>> while it's being torn down).
>>>>
>>>> Out of curiosity, does reverting just patch #2 alone make a
>>>> difference? And is your workload doing anything "interesting" in
>>>> relation to IOVA domain lifetimes, like creating and destroying
>>>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or
>>>> using that horrible vdpa thing, or are you seeing this purely from
>>>> regular driver DMA API usage?
>>>
>>> There is no lock held in free_iova_rcaches(); is it possible that
>>> free_iova_rcaches() races with the delayed work?
>>>
>>> I don't know why cancel_delayed_work_sync(&rcache->work) isn't called
>>> first in free_iova_rcaches() to avoid a possible race.
>>>
>> Between the following pairs of functions, is a race possible if they
>> are called concurrently?
>>
>> 1. free_iova_rcaches() with iova_depot_work_func()
>>
>> free_iova_rcaches() holds no lock, iova_depot_work_func() holds
>> rcache->lock.
>
> Unless I've completely misunderstood the workqueue API, that can't
> happen, since free_iova_rcaches() *does* synchronously cancel the work
> before it starts freeing the depot list.
iova_depot_work_func() pops and frees a magazine from the depot, while
free_iova_rcaches() frees the loaded and previous magazines before the
work is synchronously cancelled.
Different things, so this one is okay.
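For reference, the ordering in question (paraphrasing the v6.7
free_iova_rcaches() as described above) is: free each CPU's loaded/prev
magazines, then cancel_delayed_work_sync(&rcache->work), and only then
drain rcache->depot via iova_depot_pop()/iova_magazine_free(), so the
work can no longer be running while the depot list is freed.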
>
>> 2. iova_cpuhp_dead() with iova_depot_work_func()
>>
>> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
>> iova_depot_work_func() holds rcache->lock.
>
> That's not a race because those are touching completely different
> things - the closest they come to interacting is where they both free
> IOVAs back to the rbtree.
iova_cpuhp_dead() frees PFNs with
iova_magazine_free_pfns(cpu_rcache->loaded, iovad), while
iova_depot_work_func() frees magazines from the depot;
iova_magazine_free_pfns() holds the rbtree lock.
Okay, different things.
>
>> 3. iova_cpuhp_dead() with free_iova_rcaches()
>>
>> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock,
>> free_iova_rcaches() holds no lock.
>
> See iova_domain_free_rcaches() - by the time we call
> free_iova_rcaches(), the hotplug handler has already been removed (and
> either way it couldn't account for *this* issue since it doesn't touch
> the depot at all).
Yes, the iova_cpuhp_dead() hotplug handler is removed before free_iova_rcaches() runs.
>
>> 4. iova_cpuhp_dead() with free_global_cached_iovas()
>>
>> iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock and
>> free_global_cached_iovas() holds rcache->lock.
>
> Again, they hold different locks because they're touching unrelated
> things.
iova_cpuhp_dead() frees the PFNs in the loaded and previous magazines,
while free_global_cached_iovas() frees the magazines from the depot.
Okay too.
Then what about free_global_cached_iovas() with iova_depot_work_func()?
They both hold rcache->lock, so they serialise against each other.
So there is no race at all, which makes that memory leak report hard to
explain.
kmemleak is not always right.
Thanks,
Ethan
>
> Thanks,
> Robin.
On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> Hi Robin,
>
> Thanks for the reply.
>
> On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
>> Hmm, we've got what looks to be a set of magazines forming a plausible depot
>> list (or at least the tail end of one):
>>
>> ffff8881411f9000 -> ffff8881261c1000
>>
>> ffff8881261c1000 -> ffff88812be26400
>>
>> ffff88812be26400 -> ffff8188392ec000
>>
>> ffff8188392ec000 -> ffff8881a5301000
>>
>> ffff8881a5301000 -> NULL
>>
>> which I guess has somehow become detached from its rcache->depot without
>> being freed properly? However I'm struggling to see any conceivable way that
>> could happen which wouldn't already be more severely broken in other ways as
>> well (i.e. either general memory corruption or someone somehow still trying
>> to use the IOVA domain while it's being torn down).
>
> The machine is running a debug kernel that among other things has KASAN
> enabled, but there are no traces in the kernel log so there is no memory
> corruption that I'm aware of.
>
>> Out of curiosity, does reverting just patch #2 alone make a difference?
>
> Will try and let you know.
>
>> And is your workload doing anything "interesting" in relation to IOVA
>> domain lifetimes, like creating and destroying SR-IOV virtual
>> functions, changing IOMMU domain types via sysfs, or using that
>> horrible vdpa thing, or are you seeing this purely from regular driver
>> DMA API usage?
>
> The machine is running networking related tests, but it is not using
> SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> as IOMMU is concerned.
>
> The two networking drivers on the machine are "igb" for the management
> port and "mlxsw" for the data ports (the machine is a physical switch).
> I believe the DMA API usage in the latter is quite basic and I don't
> recall any DMA related problems with this driver since it was first
> accepted upstream in 2015.
Thanks for the clarifications, that seems to rule out all the most
confusingly impossible scenarios, at least.
The best explanation I've managed to come up with is a false-positive
race dependent on the order in which kmemleak scans the relevant
objects. Say we have the list as depot -> A -> B -> C; the rcache object
is scanned and sees the pointer to magazine A, but then A is popped
*before* kmemleak scans it, such that when it is then scanned, its
"next" pointer has already been wiped, thus kmemleak never observes any
reference to B, so it appears that B and (transitively) C are "leaked".
If that is the case, then I'd expect it should be reproducible with
patch #1 alone (although patch #2 might make it slightly more likely if
the work ever does result in additional pops happening), but I'd expect
the leaked objects to be transient and not persist forever through
repeated scans (what I don't know is whether kmemleak automatically
un-leaks an object if it subsequently finds a new reference, or if it
needs manually clearing in between scans). I'm not sure if there's a
nice way to make that any better... unless maybe it might make sense to
call kmemleak_not_leak(mag->next) in iova_depot_pop() before that
reference disappears?
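Spelled out as a timeline (illustrative only, with the depot list A, B,
C as above and the iova_depot_pop() steps quoted elsewhere in the
thread):

  kmemleak scanner                     CPU calling iova_depot_pop()
  ----------------                     ----------------------------
  scans the rcache object,
  records the reference to A
                                       mag = rcache->depot;        /* A */
                                       rcache->depot = mag->next;  /* B */
                                       mag->size = IOVA_MAG_SIZE;  /* wipes A's next */
  scans magazine A, finds no
  pointer to B
  => B and, transitively, C appear unreferenced for that scan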
Thanks,
Robin.
On Wed, Jan 10, 2024 at 12:48:06PM +0000, Robin Murphy wrote:
> On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> > Hi Robin,
> >
> > Thanks for the reply.
> >
> > On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> > > Hmm, we've got what looks to be a set of magazines forming a plausible depot
> > > list (or at least the tail end of one):
> > >
> > > ffff8881411f9000 -> ffff8881261c1000
> > >
> > > ffff8881261c1000 -> ffff88812be26400
> > >
> > > ffff88812be26400 -> ffff8188392ec000
> > >
> > > ffff8188392ec000 -> ffff8881a5301000
> > >
> > > ffff8881a5301000 -> NULL
> > >
> > > which I guess has somehow become detached from its rcache->depot without
> > > being freed properly? However I'm struggling to see any conceivable way that
> > > could happen which wouldn't already be more severely broken in other ways as
> > > well (i.e. either general memory corruption or someone somehow still trying
> > > to use the IOVA domain while it's being torn down).
> >
> > The machine is running a debug kernel that among other things has KASAN
> > enabled, but there are no traces in the kernel log so there is no memory
> > corruption that I'm aware of.
> >
> > > Out of curiosity, does reverting just patch #2 alone make a difference?
> >
> > Will try and let you know.
I can confirm that the issue reproduces when only patch #2 is reverted.
IOW, patch #1 seems to be the problem:
unreferenced object 0xffff8881a1ff3400 (size 1024):
comm "softirq", pid 0, jiffies 4296362635 (age 3540.420s)
hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 67 b7 05 00 00 00 00 00 ........g.......
3f a6 05 00 00 00 00 00 93 99 05 00 00 00 00 00 ?...............
backtrace:
[<ffffffff819f7a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a3efa>] kmalloc_trace+0x2a/0x60
[<ffffffff8231f8f3>] free_iova_fast+0x293/0x460
[<ffffffff823132f0>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff82314ced>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e97da>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea38b>] __run_timers+0x78b/0xb80
[<ffffffff813ea7dd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f21605>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff888165b9a800 (size 1024):
comm "softirq", pid 0, jiffies 4299383627 (age 519.460s)
hex dump (first 32 bytes):
00 34 ff a1 81 88 ff ff bd 9d 05 00 00 00 00 00 .4..............
f3 ab 05 00 00 00 00 00 37 b5 05 00 00 00 00 00 ........7.......
backtrace:
[<ffffffff819f7a68>] __kmem_cache_alloc_node+0x1e8/0x320
[<ffffffff818a3efa>] kmalloc_trace+0x2a/0x60
[<ffffffff8231f8f3>] free_iova_fast+0x293/0x460
[<ffffffff823132f0>] fq_ring_free_locked+0x1b0/0x310
[<ffffffff82314ced>] fq_flush_timeout+0x19d/0x2e0
[<ffffffff813e97da>] call_timer_fn+0x19a/0x5c0
[<ffffffff813ea38b>] __run_timers+0x78b/0xb80
[<ffffffff813ea7dd>] run_timer_softirq+0x5d/0xd0
[<ffffffff82f21605>] __do_softirq+0x205/0x8b5
> >
> > > And is your workload doing anything "interesting" in relation to IOVA
> > > domain lifetimes, like creating and destroying SR-IOV virtual
> > > functions, changing IOMMU domain types via sysfs, or using that
> > > horrible vdpa thing, or are you seeing this purely from regular driver
> > > DMA API usage?
> >
> > The machine is running networking related tests, but it is not using
> > SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> > as IOMMU is concerned.
> >
> > The two networking drivers on the machine are "igb" for the management
> > port and "mlxsw" for the data ports (the machine is a physical switch).
> > I believe the DMA API usage in the latter is quite basic and I don't
> > recall any DMA related problems with this driver since it was first
> > accepted upstream in 2015.
>
> Thanks for the clarifications, that seems to rule out all the most
> confusingly impossible scenarios, at least.
>
> The best explanation I've managed to come up with is a false-positive race
> dependent on the order in which kmemleak scans the relevant objects. Say we
> have the list as depot -> A -> B -> C; the rcache object is scanned and sees
> the pointer to magazine A, but then A is popped *before* kmemleak scans it,
> such that when it is then scanned, its "next" pointer has already been
> wiped, thus kmemleak never observes any reference to B, so it appears that B
> and (transitively) C are "leaked". If that is the case, then I'd expect it
> should be reproducible with patch #1 alone (although patch #2 might make it
> slightly more likely if the work ever does result in additional pops
> happening), but I'd expect the leaked objects to be transient and not
> persist forever through repeated scans (what I don't know is whether
> kmemleak automatically un-leaks an object if it subsequently finds a new
> reference, or if it needs manually clearing in between scans). I'm not sure
> if there's a nice way to make that any better... unless maybe it might make
> sense to call kmemleak_not_leak(mag->next) in iova_depot_pop() before that
> reference disappears?
I'm not familiar with the code so I can't comment on whether that's the
best solution, but I will say that we've been running kmemleak as part
of our regression tests for years and every time we got a report it was
an actual memory leak. Therefore, in order to keep the tool reliable, I
think it's better to annotate the code to suppress false positives
rather than ignore them.
Please let me know if you want me to test a fix.
Thanks for looking into this!
On Wed, Jan 10, 2024 at 12:48:06PM +0000, Robin Murphy wrote:
> On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> > On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> > > Hmm, we've got what looks to be a set of magazines forming a plausible depot
> > > list (or at least the tail end of one):
> > >
> > > ffff8881411f9000 -> ffff8881261c1000
> > >
> > > ffff8881261c1000 -> ffff88812be26400
> > >
> > > ffff88812be26400 -> ffff8188392ec000
> > >
> > > ffff8188392ec000 -> ffff8881a5301000
> > >
> > > ffff8881a5301000 -> NULL
> > >
> > > which I guess has somehow become detached from its rcache->depot without
> > > being freed properly? However I'm struggling to see any conceivable way that
> > > could happen which wouldn't already be more severely broken in other ways as
> > > well (i.e. either general memory corruption or someone somehow still trying
> > > to use the IOVA domain while it's being torn down).
> >
> > The machine is running a debug kernel that among other things has KASAN
> > enabled, but there are no traces in the kernel log so there is no memory
> > corruption that I'm aware of.
> >
> > > Out of curiosity, does reverting just patch #2 alone make a difference?
> >
> > Will try and let you know.
> >
> > > And is your workload doing anything "interesting" in relation to IOVA
> > > domain lifetimes, like creating and destroying SR-IOV virtual
> > > functions, changing IOMMU domain types via sysfs, or using that
> > > horrible vdpa thing, or are you seeing this purely from regular driver
> > > DMA API usage?
> >
> > The machine is running networking related tests, but it is not using
> > SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> > as IOMMU is concerned.
> >
> > The two networking drivers on the machine are "igb" for the management
> > port and "mlxsw" for the data ports (the machine is a physical switch).
> > I believe the DMA API usage in the latter is quite basic and I don't
> > recall any DMA related problems with this driver since it was first
> > accepted upstream in 2015.
>
> Thanks for the clarifications, that seems to rule out all the most
> confusingly impossible scenarios, at least.
>
> The best explanation I've managed to come up with is a false-positive race
> dependent on the order in which kmemleak scans the relevant objects. Say we
> have the list as depot -> A -> B -> C; the rcache object is scanned and sees
> the pointer to magazine A, but then A is popped *before* kmemleak scans it,
> such that when it is then scanned, its "next" pointer has already been
> wiped, thus kmemleak never observes any reference to B, so it appears that B
> and (transitively) C are "leaked".
Transient false positives are possible, especially as the code doesn't
use a double-linked list (for the latter, kmemleak does checksumming and
detects the prev/next change, defers the reporting until the object
becomes stable). That said, if a new scan is forced (echo scan >
/sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
If yes, they may not be transient.
If it is indeed transient, I think a better fix than kmemleak_not_leak()
is to add a new API, something like kmemleak_mark_transient(), which
resets the checksum and skips reporting the object for one scan.
--
Catalin
On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> Transient false positives are possible, especially as the code doesn't
> use a double-linked list (for the latter, kmemleak does checksumming and
> detects the prev/next change, defers the reporting until the object
> becomes stable). That said, if a new scan is forced (echo scan >
> /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> If yes, they may not be transient.
We are doing "scan" and "clear" after each test. I will disable the
"clear" and see if the leaks persist.
On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > Transient false positives are possible, especially as the code doesn't
> > use a double-linked list (for the latter, kmemleak does checksumming and
> > detects the prev/next change, defers the reporting until the object
> > becomes stable). That said, if a new scan is forced (echo scan >
> > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > If yes, they may not be transient.
>
> We are doing "scan" and "clear" after each test. I will disable the
> "clear" and see if the leaks persist.
If it is indeed a false positive, you can try the patch below (I haven't
given it any run-time test, only compiled):
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 2cb00b53339f..7d784e03f3f9 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -161,6 +161,7 @@ See the include/linux/kmemleak.h header for the functions prototype.
- ``kmemleak_free_percpu`` - notify of a percpu memory block freeing
- ``kmemleak_update_trace`` - update object allocation stack trace
- ``kmemleak_not_leak`` - mark an object as not a leak
+- ``kmemleak_transient_leak`` - mark an object as a transient leak
- ``kmemleak_ignore`` - do not scan or report an object as leak
- ``kmemleak_scan_area`` - add scan areas inside a memory block
- ``kmemleak_no_scan`` - do not scan a memory block
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d30e453d0fb4..c1d0775080ff 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -6,6 +6,7 @@
*/
#include <linux/iova.h>
+#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp.h>
@@ -730,6 +731,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
{
struct iova_magazine *mag = rcache->depot;
+ /*
+ * As the mag->next pointer is moved to rcache->depot and reset via
+ * the mag->size assignment, mark the transient false positive.
+ */
+ kmemleak_transient_leak(mag->next);
rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
rcache->depot_size--;
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 6a3cd1bf4680..93a73c076d16 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
extern void kmemleak_update_trace(const void *ptr) __ref;
extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_transient_leak(const void *ptr) __ref;
extern void kmemleak_ignore(const void *ptr) __ref;
extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
extern void kmemleak_no_scan(const void *ptr) __ref;
@@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr)
static inline void kmemleak_not_leak(const void *ptr)
{
}
+static inline void kmemleak_transient_leak(const void *ptr)
+{
+}
static inline void kmemleak_ignore(const void *ptr)
{
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5501363d6b31..9fd338063cea 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -915,6 +915,28 @@ static void make_black_object(unsigned long ptr, bool is_phys)
paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
}
+/*
+ * Reset the checksum of an object. The immediate effect is that it will not
+ * be reported as a leak during the next scan until its checksum is updated.
+ */
+static void reset_checksum(unsigned long ptr)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ object = find_and_get_object(ptr, 0);
+ if (!object) {
+ kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ object->checksum = 0;
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+}
+
/*
* Add a scanning area to the object. If at least one such area is added,
* kmemleak will only scan these ranges rather than the whole memory block.
@@ -1194,6 +1216,23 @@ void __ref kmemleak_not_leak(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_not_leak);
+/**
+ * kmemleak_transient_leak - mark an allocated object as transient false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to not be
+ * reported as a leak temporarily. This may happen, for example, if the object
+ * is part of a singly linked list and the ->next reference to it is changed.
+ */
+void __ref kmemleak_transient_leak(const void *ptr)
+{
+ pr_debug("%s(0x%px)\n", __func__, ptr);
+
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ reset_checksum((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_transient_leak);
+
/**
* kmemleak_ignore - ignore an allocated object
* @ptr: pointer to beginning of the object
--
Catalin
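As a general pattern for other singly-linked caches, the new hook is
meant to be called right where the only live ->next reference is about
to be rewritten. A minimal sketch, assuming the kmemleak_transient_leak()
added by the patch above (the cache_node/cache_head/cache_lock names are
made up for illustration):

#include <linux/kmemleak.h>
#include <linux/spinlock.h>

struct cache_node {
	struct cache_node *next;
	/* payload ... */
};

static struct cache_node *cache_head;	/* hypothetical LIFO free list */
static DEFINE_SPINLOCK(cache_lock);

static struct cache_node *cache_pop(void)
{
	struct cache_node *n;

	spin_lock(&cache_lock);
	n = cache_head;
	if (n) {
		/*
		 * The only reference to n->next is about to move into
		 * cache_head; if a scan saw cache_head before this point
		 * but scans n only afterwards, the successor would look
		 * unreferenced, so flag it as a transient false positive.
		 */
		kmemleak_transient_leak(n->next);
		cache_head = n->next;
		n->next = NULL;
	}
	spin_unlock(&cache_lock);
	return n;
}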
On Thu, Jan 11, 2024 at 10:13:01AM +0000, Catalin Marinas wrote:
> On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> > On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > > Transient false positives are possible, especially as the code doesn't
> > > use a double-linked list (for the latter, kmemleak does checksumming and
> > > detects the prev/next change, defers the reporting until the object
> > > becomes stable). That said, if a new scan is forced (echo scan >
> > > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > > If yes, they may not be transient.
> >
> > We are doing "scan" and "clear" after each test. I will disable the
> > "clear" and see if the leaks persist.
>
> If it is indeed a false positive
Looks like the leaks are transient. After removing the "clear" step the
leaks do not seem to persist.
> you can try the patch below (I haven't given it any run-time test,
> only compiled):
Will try and let you know next week.
Thanks
On Fri, Jan 12, 2024 at 05:31:15PM +0200, Ido Schimmel wrote:
> On Thu, Jan 11, 2024 at 10:13:01AM +0000, Catalin Marinas wrote:
> > On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> > > On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > > > Transient false positives are possible, especially as the code doesn't
> > > > use a double-linked list (for the latter, kmemleak does checksumming and
> > > > detects the prev/next change, defers the reporting until the object
> > > > becomes stable). That said, if a new scan is forced (echo scan >
> > > > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > > > If yes, they may not be transient.
> > >
> > > We are doing "scan" and "clear" after each test. I will disable the
> > > "clear" and see if the leaks persist.
> >
> > If it is indeed a false positive
>
> Looks like the leaks are transient. After removing the "clear" step the
> leaks do not seem to persist.
>
> > you can try the patch below (I haven't given it any run-time test,
> > only compiled):
>
> Will try and let you know next week.
Looks good. Feel free to add:
Tested-by: Ido Schimmel <[email protected]>
Thanks!