After the commit 754265bcab78 ("iommu/amd: Fix race in
increase_address_space()"), it could still be possible to trigger a race
condition under heavy memory pressure, as shown below. The race that
triggers the warning is,
CPU0: CPU1:
in alloc_pte(): in increase_address_space():
while (address > PM_LEVEL_SIZE(domain->mode)) [1]
spin_lock_irqsave(&domain->lock
domain->mode += 1;
spin_unlock_irqrestore(&domain->lock
in increase_address_space():
spin_lock_irqsave(&domain->lock
if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
[1] domain->mode = 5
It is unclear whether the triggering of the warning is the root cause of
the smartpqi offline yet, but let's fix it first by lifting the locking.
WARNING: CPU: 57 PID: 124314 at drivers/iommu/amd_iommu.c:1474
iommu_map_page+0x718/0x7e0
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec0000 flags=0x0010]
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec1000 flags=0x0010]
CPU: 57 PID: 124314 Comm: oom01 Tainted: G O
Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40
07/10/2019
RIP: 0010:iommu_map_page+0x718/0x7e0
Code: 88 a5 70 ff ff ff e9 5d fa ff ff 48 8b b5 70 ff ff ff 4c 89 ef e8
08 32 2f 00 41 80 fc 01 0f 87 b7 3d 00 00 41 83 e4 01 eb be <0f> 0b 48
8b b5 70 ff ff ff 4c 89 ef e8 e7 31 2f 00 eb dd 0f 0b 48
RSP: 0018:ffff888da4816cb8 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8885fe689000 RCX: ffffffff96f4a6c4
RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8885fe689124
RBP: ffff888da4816da8 R08: ffffed10bfcd120e R09: ffffed10bfcd120e
R10: ffffed10bfcd120d R11: ffff8885fe68906b R12: 0000000000000000
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec1a00 flags=0x0010]
R13: ffff8885fe689068 R14: ffff8885fe689124 R15: 0000000000000000
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec1e00 flags=0x0010]
FS: 00007f29722ba700(0000) GS:ffff88902f880000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f27f82d8000 CR3: 000000102ed9c000 CR4: 00000000003406e0
Call Trace:
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec2000 flags=0x0010]
map_sg+0x1ce/0x2f0
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec2400 flags=0x0010]
scsi_dma_map+0xd7/0x160
pqi_raid_submit_scsi_cmd_with_io_request+0x1b8/0x420 [smartpqi]
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec2800 flags=0x0010]
pqi_scsi_queue_command+0x8ab/0xe00 [smartpqi]
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec2c00 flags=0x0010]
scsi_queue_rq+0xd19/0x1360
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec3000 flags=0x0010]
__blk_mq_try_issue_directly+0x295/0x3f0
smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
address=0xffffffffffec3400 flags=0x0010]
AMD-Vi: Event logged [IO_PAGE_FAULT device=23:00.0 domain=0x0000
address=0xffffffffffec3800 flags=0x0010]
blk_mq_request_issue_directly+0xb5/0x100
AMD-Vi: Event logged [IO_PAGE_FAULT device=23:00.0 domain=0x0000
address=0xffffffffffec3c00 flags=0x0010]
blk_mq_try_issue_list_directly+0xa9/0x160
blk_mq_sched_insert_requests+0x228/0x380
blk_mq_flush_plug_list+0x448/0x7e0
blk_flush_plug_list+0x1eb/0x230
blk_finish_plug+0x43/0x5d
shrink_node_memcg+0x9c5/0x1550
smartpqi 0000:23:00.0: controller is offline: status code 0x14803
smartpqi 0000:23:00.0: controller offline
Fixes: 754265bcab78 ("iommu/amd: Fix race in increase_address_space()")
Signed-off-by: Qian Cai <[email protected]>
---
BTW, Joerg, this line from the commit "iommu/amd: Remove domain->updated" looks
suspicious. Not sure what the purpose of it.
*updated = increase_address_space(domain, gfp) || *updated;
drivers/iommu/amd_iommu.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 2369b8af81f3..a5754068aa29 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1465,12 +1465,9 @@ static void free_pagetable(struct protection_domain *domain)
static bool increase_address_space(struct protection_domain *domain,
gfp_t gfp)
{
- unsigned long flags;
bool ret = false;
u64 *pte;
- spin_lock_irqsave(&domain->lock, flags);
-
if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
/* address space already 64 bit large */
goto out;
@@ -1487,8 +1484,6 @@ static bool increase_address_space(struct protection_domain *domain,
ret = true;
out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
return ret;
}
@@ -1499,14 +1494,19 @@ static u64 *alloc_pte(struct protection_domain *domain,
gfp_t gfp,
bool *updated)
{
+ unsigned long flags;
int level, end_lvl;
u64 *pte, *page;
BUG_ON(!is_power_of_2(page_size));
+ spin_lock_irqsave(&domain->lock, flags);
+
while (address > PM_LEVEL_SIZE(domain->mode))
*updated = increase_address_space(domain, gfp) || *updated;
+ spin_unlock_irqrestore(&domain->lock, flags);
+
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
address = PAGE_SIZE_ALIGN(address, page_size);
--
1.8.3.1
On Wed Oct 16 19, Qian Cai wrote:
>After the commit 754265bcab78 ("iommu/amd: Fix race in
>increase_address_space()"), it could still possible trigger a race
>condition under some heavy memory pressure below. The race to trigger a
>warning is,
>
>CPU0: CPU1:
>in alloc_pte(): in increase_address_space():
>while (address > PM_LEVEL_SIZE(domain->mode)) [1]
>
> spin_lock_irqsave(&domain->lock
> domain->mode += 1;
> spin_unlock_irqrestore(&domain->lock
>
>in increase_address_space():
>spin_lock_irqsave(&domain->lock
>if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
>
>[1] domain->mode = 5
>
>It is unclear the triggering of the warning is the root cause of the
>smartpqi offline yet, but let's fix it first by lifting the locking.
>
>WARNING: CPU: 57 PID: 124314 at drivers/iommu/amd_iommu.c:1474
>iommu_map_page+0x718/0x7e0
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec0000 flags=0x0010]
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec1000 flags=0x0010]
>CPU: 57 PID: 124314 Comm: oom01 Tainted: G O
>Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40
>07/10/2019
>RIP: 0010:iommu_map_page+0x718/0x7e0
>Code: 88 a5 70 ff ff ff e9 5d fa ff ff 48 8b b5 70 ff ff ff 4c 89 ef e8
>08 32 2f 00 41 80 fc 01 0f 87 b7 3d 00 00 41 83 e4 01 eb be <0f> 0b 48
>8b b5 70 ff ff ff 4c 89 ef e8 e7 31 2f 00 eb dd 0f 0b 48
>RSP: 0018:ffff888da4816cb8 EFLAGS: 00010046
>RAX: 0000000000000000 RBX: ffff8885fe689000 RCX: ffffffff96f4a6c4
>RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8885fe689124
>RBP: ffff888da4816da8 R08: ffffed10bfcd120e R09: ffffed10bfcd120e
>R10: ffffed10bfcd120d R11: ffff8885fe68906b R12: 0000000000000000
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec1a00 flags=0x0010]
>R13: ffff8885fe689068 R14: ffff8885fe689124 R15: 0000000000000000
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec1e00 flags=0x0010]
>FS: 00007f29722ba700(0000) GS:ffff88902f880000(0000)
>knlGS:0000000000000000
>CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>CR2: 00007f27f82d8000 CR3: 000000102ed9c000 CR4: 00000000003406e0
>Call Trace:
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec2000 flags=0x0010]
> map_sg+0x1ce/0x2f0
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec2400 flags=0x0010]
> scsi_dma_map+0xd7/0x160
> pqi_raid_submit_scsi_cmd_with_io_request+0x1b8/0x420 [smartpqi]
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec2800 flags=0x0010]
> pqi_scsi_queue_command+0x8ab/0xe00 [smartpqi]
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec2c00 flags=0x0010]
> scsi_queue_rq+0xd19/0x1360
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec3000 flags=0x0010]
> __blk_mq_try_issue_directly+0x295/0x3f0
>smartpqi 0000:23:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000
>address=0xffffffffffec3400 flags=0x0010]
>AMD-Vi: Event logged [IO_PAGE_FAULT device=23:00.0 domain=0x0000
>address=0xffffffffffec3800 flags=0x0010]
> blk_mq_request_issue_directly+0xb5/0x100
>AMD-Vi: Event logged [IO_PAGE_FAULT device=23:00.0 domain=0x0000
>address=0xffffffffffec3c00 flags=0x0010]
> blk_mq_try_issue_list_directly+0xa9/0x160
> blk_mq_sched_insert_requests+0x228/0x380
> blk_mq_flush_plug_list+0x448/0x7e0
> blk_flush_plug_list+0x1eb/0x230
> blk_finish_plug+0x43/0x5d
> shrink_node_memcg+0x9c5/0x1550
>smartpqi 0000:23:00.0: controller is offline: status code 0x14803
>smartpqi 0000:23:00.0: controller offline
>
>Fixes: 754265bcab78 ("iommu/amd: Fix race in increase_address_space()")
>Signed-off-by: Qian Cai <[email protected]>
>---
>
>BTW, Joerg, this line from the commit "iommu/amd: Remove domain->updated" looks
>suspicious. Not sure what the purpose of it.
>
>*updated = increase_address_space(domain, gfp) || *updated;
>
Looking at it again I think that isn't an issue really, it would just
not lose updated being set in a previous loop iteration, but now
I'm wondering about the loop itself. In the cases where it would return
false, how does the evaluation of the condition for the while loop
change?
> drivers/iommu/amd_iommu.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
>index 2369b8af81f3..a5754068aa29 100644
>--- a/drivers/iommu/amd_iommu.c
>+++ b/drivers/iommu/amd_iommu.c
>@@ -1465,12 +1465,9 @@ static void free_pagetable(struct protection_domain *domain)
> static bool increase_address_space(struct protection_domain *domain,
> gfp_t gfp)
> {
>- unsigned long flags;
> bool ret = false;
> u64 *pte;
>
>- spin_lock_irqsave(&domain->lock, flags);
>-
> if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
> /* address space already 64 bit large */
> goto out;
>@@ -1487,8 +1484,6 @@ static bool increase_address_space(struct protection_domain *domain,
> ret = true;
>
> out:
>- spin_unlock_irqrestore(&domain->lock, flags);
>-
> return ret;
> }
>
>@@ -1499,14 +1494,19 @@ static u64 *alloc_pte(struct protection_domain *domain,
> gfp_t gfp,
> bool *updated)
> {
>+ unsigned long flags;
> int level, end_lvl;
> u64 *pte, *page;
>
> BUG_ON(!is_power_of_2(page_size));
>
>+ spin_lock_irqsave(&domain->lock, flags);
>+
> while (address > PM_LEVEL_SIZE(domain->mode))
> *updated = increase_address_space(domain, gfp) || *updated;
>
>+ spin_unlock_irqrestore(&domain->lock, flags);
>+
> level = domain->mode - 1;
> pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
> address = PAGE_SIZE_ALIGN(address, page_size);
>--
>1.8.3.1
>
>_______________________________________________
>iommu mailing list
>[email protected]
>https://lists.linuxfoundation.org/mailman/listinfo/iommu
On Wed Oct 16 19, Jerry Snitselaar wrote:
>On Wed Oct 16 19, Qian Cai wrote:
>>
>>BTW, Joerg, this line from the commit "iommu/amd: Remove domain->updated" looks
>>suspicious. Not sure what the purpose of it.
>>
>>*updated = increase_address_space(domain, gfp) || *updated;
>>
>
>Looking at it again I think that isn't an issue really, it would just
>not lose updated being set in a previous loop iteration, but now
>I'm wondering about the loop itself. In the cases where it would return
>false, how does the evaluation of the condition for the while loop
>change?
>
I guess the mode level 6 check is really for other potential callers of
increase_address_space — none exist at the moment — and the condition
of the while loop in alloc_pte should fail if the mode level is 6.
> On Oct 16, 2019, at 6:59 PM, Jerry Snitselaar <[email protected]> wrote:
>
> I guess the mode level 6 check is really for other potential callers
> increase_address_space, none exist at the moment, and the condition
> of the while loop in alloc_pte should fail if the mode level is 6.
Because there is no locking around iommu_map_page(), if there are several concurrent callers of it for the same domain, could it be that it silently corrupts data due to invalid accesses?
On Thu, Oct 17, 2019 at 07:36:51AM -0400, Qian Cai wrote:
>
>
> > On Oct 16, 2019, at 6:59 PM, Jerry Snitselaar <[email protected]> wrote:
> >
> > I guess the mode level 6 check is really for other potential callers
> > increase_address_space, none exist at the moment, and the condition
> > of the while loop in alloc_pte should fail if the mode level is 6.
>
> Because there is no locking around iommu_map_page(), if there are
> several concurrent callers of it for the same domain, could it be that
> it silently corrupt data due to invalid access?
No, that can't happen because increase_address_space locks the domain
before actually doing anything. So the address space can't grow above
domain->mode == 6. But what can happen is that the WARN_ON_ONCE triggers
in there and that the address space is increased multiple times when
only one increase would be sufficient.
To fix this we just need to check the PM_LEVEL_SIZE() condition again
when we hold the lock:
From e930e792a998e89dfd4feef15fbbf289c45124dc Mon Sep 17 00:00:00 2001
From: Joerg Roedel <[email protected]>
Date: Fri, 18 Oct 2019 11:34:22 +0200
Subject: [PATCH] iommu/amd: Check PM_LEVEL_SIZE() condition in locked section
The increase_address_space() function has to check the PM_LEVEL_SIZE()
condition again under the domain->lock to avoid a false trigger of the
WARN_ON_ONCE() and to avoid increasing the address space more often
than necessary.
Reported-by: Qian Cai <[email protected]>
Fixes: 754265bcab78 ("iommu/amd: Fix race in increase_address_space()")
Signed-off-by: Joerg Roedel <[email protected]>
---
drivers/iommu/amd_iommu.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 2369b8af81f3..a0639e511ffe 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1463,6 +1463,7 @@ static void free_pagetable(struct protection_domain *domain)
* to 64 bits.
*/
static bool increase_address_space(struct protection_domain *domain,
+ unsigned long address,
gfp_t gfp)
{
unsigned long flags;
@@ -1471,8 +1472,8 @@ static bool increase_address_space(struct protection_domain *domain,
spin_lock_irqsave(&domain->lock, flags);
- if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
- /* address space already 64 bit large */
+ if (address <= PM_LEVEL_SIZE(domain->mode) ||
+ WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
goto out;
pte = (void *)get_zeroed_page(gfp);
@@ -1505,7 +1506,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
- *updated = increase_address_space(domain, gfp) || *updated;
+ *updated = increase_address_space(domain, address, gfp) || *updated;
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
--
2.16.4
On Fri Oct 18 19, Joerg Roedel wrote:
>On Thu, Oct 17, 2019 at 07:36:51AM -0400, Qian Cai wrote:
>>
>>
>> > On Oct 16, 2019, at 6:59 PM, Jerry Snitselaar <[email protected]> wrote:
>> >
>> > I guess the mode level 6 check is really for other potential callers
>> > increase_address_space, none exist at the moment, and the condition
>> > of the while loop in alloc_pte should fail if the mode level is 6.
>>
>> Because there is no locking around iommu_map_page(), if there are
>> several concurrent callers of it for the same domain, could it be that
>> it silently corrupt data due to invalid access?
>
>No, that can't happen because increase_address_space locks the domain
>before actually doing anything. So the address space can't grow above
>domain->mode == 6. But what can happen is that the WARN_ON_ONCE triggers
>in there and that the address space is increased multiple times when
>only one increase would be sufficient.
>
>To fix this we just need to check the PM_LEVEL_SIZE() condition again
>when we hold the lock:
>
>From e930e792a998e89dfd4feef15fbbf289c45124dc Mon Sep 17 00:00:00 2001
>From: Joerg Roedel <[email protected]>
>Date: Fri, 18 Oct 2019 11:34:22 +0200
>Subject: [PATCH] iommu/amd: Check PM_LEVEL_SIZE() condition in locked section
>
>The increase_address_space() function has to check the PM_LEVEL_SIZE()
>condition again under the domain->lock to avoid a false trigger of the
>WARN_ON_ONCE() and to avoid that the address space is increase more
>often than necessary.
>
>Reported-by: Qian Cai <[email protected]>
>Fixes: 754265bcab78 ("iommu/amd: Fix race in increase_address_space()")
>Signed-off-by: Joerg Roedel <[email protected]>
>---
> drivers/iommu/amd_iommu.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
>index 2369b8af81f3..a0639e511ffe 100644
>--- a/drivers/iommu/amd_iommu.c
>+++ b/drivers/iommu/amd_iommu.c
>@@ -1463,6 +1463,7 @@ static void free_pagetable(struct protection_domain *domain)
> * to 64 bits.
> */
> static bool increase_address_space(struct protection_domain *domain,
>+ unsigned long address,
> gfp_t gfp)
> {
> unsigned long flags;
>@@ -1471,8 +1472,8 @@ static bool increase_address_space(struct protection_domain *domain,
>
> spin_lock_irqsave(&domain->lock, flags);
>
>- if (WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
>- /* address space already 64 bit large */
>+ if (address <= PM_LEVEL_SIZE(domain->mode) ||
>+ WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
> goto out;
>
> pte = (void *)get_zeroed_page(gfp);
>@@ -1505,7 +1506,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
> BUG_ON(!is_power_of_2(page_size));
>
> while (address > PM_LEVEL_SIZE(domain->mode))
>- *updated = increase_address_space(domain, gfp) || *updated;
>+ *updated = increase_address_space(domain, address, gfp) || *updated;
>
> level = domain->mode - 1;
> pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
>--
>2.16.4
>
Reviewed-by: Jerry Snitselaar <[email protected]>