2019-08-29 22:47:03

by Krishna Reddy

[permalink] [raw]
Subject: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

tlb_sync hook allows nvidia smmu handle tlb sync
across multiple SMMUs as necessary.

Signed-off-by: Krishna Reddy <[email protected]>
---
drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++
drivers/iommu/arm-smmu.c | 8 +++++---
drivers/iommu/arm-smmu.h | 4 ++++
3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
index d93ceda..a429b2c 100644
--- a/drivers/iommu/arm-smmu-nvidia.c
+++ b/drivers/iommu/arm-smmu-nvidia.c
@@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu,
writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset);
}

+static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page,
+ int sync, int status, int inst)
+{
+ u32 reg;
+ unsigned int spin_cnt, delay;
+
+ for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+ for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+ reg = readl_relaxed(
+ nsmmu_page(smmu, inst, page) + status);
+ if (!(reg & sTLBGSTATUS_GSACTIVE))
+ return;
+ cpu_relax();
+ }
+ udelay(delay);
+ }
+ dev_err_ratelimited(smmu->dev,
+ "TLB sync timed out -- SMMU may be deadlocked\n");
+}
+
+static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page,
+ int sync, int status)
+{
+ int i;
+
+ arm_smmu_writel(smmu, page, sync, 0);
+
+ for (i = 0; i < to_nsmmu(smmu)->num_inst; i++)
+ nsmmu_tlb_sync_wait(smmu, page, sync, status, i);
+}
+
static const struct arm_smmu_impl nsmmu_impl = {
.read_reg = nsmmu_read_reg,
.write_reg = nsmmu_write_reg,
.read_reg64 = nsmmu_read_reg64,
.write_reg64 = nsmmu_write_reg64,
+ .tlb_sync = nsmmu_tlb_sync,
};

struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 46e1641..f5454e71 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -52,9 +52,6 @@
*/
#define QCOM_DUMMY_VAL -1

-#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
-#define TLB_SPIN_COUNT 10
-
#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000

@@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
unsigned int spin_cnt, delay;
u32 reg;

+ if (smmu->impl->tlb_sync) {
+ smmu->impl->tlb_sync(smmu, page, sync, status);
+ return;
+ }
+
arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
index 9645bf1..d3217f1 100644
--- a/drivers/iommu/arm-smmu.h
+++ b/drivers/iommu/arm-smmu.h
@@ -207,6 +207,8 @@ enum arm_smmu_cbar_type {
/* Maximum number of context banks per SMMU */
#define ARM_SMMU_MAX_CBS 128

+#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
+#define TLB_SPIN_COUNT 10

/* Shared driver definitions */
enum arm_smmu_arch_version {
@@ -336,6 +338,8 @@ struct arm_smmu_impl {
int (*cfg_probe)(struct arm_smmu_device *smmu);
int (*reset)(struct arm_smmu_device *smmu);
int (*init_context)(struct arm_smmu_domain *smmu_domain);
+ void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
+ int status);
};

static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
--
2.1.4


2019-08-30 11:17:49

by Thierry Reding

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

On Thu, Aug 29, 2019 at 03:47:03PM -0700, Krishna Reddy wrote:
> tlb_sync hook allows nvidia smmu handle tlb sync
> across multiple SMMUs as necessary.
>
> Signed-off-by: Krishna Reddy <[email protected]>
> ---
> drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++
> drivers/iommu/arm-smmu.c | 8 +++++---
> drivers/iommu/arm-smmu.h | 4 ++++
> 3 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
> index d93ceda..a429b2c 100644
> --- a/drivers/iommu/arm-smmu-nvidia.c
> +++ b/drivers/iommu/arm-smmu-nvidia.c
> @@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu,
> writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset);
> }
>
> +static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page,
> + int sync, int status, int inst)
> +{
> + u32 reg;
> + unsigned int spin_cnt, delay;
> +
> + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> + reg = readl_relaxed(
> + nsmmu_page(smmu, inst, page) + status);
> + if (!(reg & sTLBGSTATUS_GSACTIVE))
> + return;
> + cpu_relax();
> + }
> + udelay(delay);
> + }
> + dev_err_ratelimited(smmu->dev,
> + "TLB sync timed out -- SMMU may be deadlocked\n");
> +}
> +
> +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page,
> + int sync, int status)
> +{
> + int i;
> +
> + arm_smmu_writel(smmu, page, sync, 0);
> +
> + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++)
> + nsmmu_tlb_sync_wait(smmu, page, sync, status, i);
> +}
> +
> static const struct arm_smmu_impl nsmmu_impl = {
> .read_reg = nsmmu_read_reg,
> .write_reg = nsmmu_write_reg,
> .read_reg64 = nsmmu_read_reg64,
> .write_reg64 = nsmmu_write_reg64,
> + .tlb_sync = nsmmu_tlb_sync,
> };
>
> struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 46e1641..f5454e71 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -52,9 +52,6 @@
> */
> #define QCOM_DUMMY_VAL -1
>
> -#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
> -#define TLB_SPIN_COUNT 10
> -
> #define MSI_IOVA_BASE 0x8000000
> #define MSI_IOVA_LENGTH 0x100000
>
> @@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
> unsigned int spin_cnt, delay;
> u32 reg;
>
> + if (smmu->impl->tlb_sync) {
> + smmu->impl->tlb_sync(smmu, page, sync, status);
> + return;
> + }
> +

Wouldn't it work if you replaced all calls of __arm_smmu_tlb_sync() by
smmu->impl->tlb_sync() and assign __arm_smmu_tlb_sync() as default for
devices that don't need to override it? That makes this patch slightly
larger, but it saves us one level of indirection.

> +
> arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
> for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> index 9645bf1..d3217f1 100644
> --- a/drivers/iommu/arm-smmu.h
> +++ b/drivers/iommu/arm-smmu.h
> @@ -207,6 +207,8 @@ enum arm_smmu_cbar_type {
> /* Maximum number of context banks per SMMU */
> #define ARM_SMMU_MAX_CBS 128
>
> +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
> +#define TLB_SPIN_COUNT 10
>
> /* Shared driver definitions */
> enum arm_smmu_arch_version {
> @@ -336,6 +338,8 @@ struct arm_smmu_impl {
> int (*cfg_probe)(struct arm_smmu_device *smmu);
> int (*reset)(struct arm_smmu_device *smmu);
> int (*init_context)(struct arm_smmu_domain *smmu_domain);
> + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> + int status);

Can't page, sync and status all be unsigned?

Thierry


Attachments:
(No filename) (3.88 kB)
signature.asc (849.00 B)
Download all attachments

2019-08-30 15:25:38

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

On 29/08/2019 23:47, Krishna Reddy wrote:
> tlb_sync hook allows nvidia smmu handle tlb sync
> across multiple SMMUs as necessary.
>
> Signed-off-by: Krishna Reddy <[email protected]>
> ---
> drivers/iommu/arm-smmu-nvidia.c | 32 ++++++++++++++++++++++++++++++++
> drivers/iommu/arm-smmu.c | 8 +++++---
> drivers/iommu/arm-smmu.h | 4 ++++
> 3 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c
> index d93ceda..a429b2c 100644
> --- a/drivers/iommu/arm-smmu-nvidia.c
> +++ b/drivers/iommu/arm-smmu-nvidia.c
> @@ -56,11 +56,43 @@ static void nsmmu_write_reg64(struct arm_smmu_device *smmu,
> writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset);
> }
>
> +static void nsmmu_tlb_sync_wait(struct arm_smmu_device *smmu, int page,
> + int sync, int status, int inst)
> +{
> + u32 reg;
> + unsigned int spin_cnt, delay;
> +
> + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> + reg = readl_relaxed(
> + nsmmu_page(smmu, inst, page) + status);
> + if (!(reg & sTLBGSTATUS_GSACTIVE))
> + return;
> + cpu_relax();
> + }
> + udelay(delay);
> + }
> + dev_err_ratelimited(smmu->dev,
> + "TLB sync timed out -- SMMU may be deadlocked\n");
> +}
> +
> +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page,
> + int sync, int status)
> +{
> + int i;
> +
> + arm_smmu_writel(smmu, page, sync, 0);
> +
> + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++)

It might make more sense to make this the innermost loop, i.e.:

for (i = 0; i < nsmmu->num_inst; i++)
reg &= readl_relaxed(nsmmu_page(smmu, i, page)...

since polling the instances in parallel rather than in series seems like
it might be a bit more efficient.

> + nsmmu_tlb_sync_wait(smmu, page, sync, status, i);
> +}
> +
> static const struct arm_smmu_impl nsmmu_impl = {
> .read_reg = nsmmu_read_reg,
> .write_reg = nsmmu_write_reg,
> .read_reg64 = nsmmu_read_reg64,
> .write_reg64 = nsmmu_write_reg64,
> + .tlb_sync = nsmmu_tlb_sync,
> };
>
> struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu)
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 46e1641..f5454e71 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -52,9 +52,6 @@
> */
> #define QCOM_DUMMY_VAL -1
>
> -#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
> -#define TLB_SPIN_COUNT 10
> -
> #define MSI_IOVA_BASE 0x8000000
> #define MSI_IOVA_LENGTH 0x100000
>
> @@ -244,6 +241,11 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu, int page,
> unsigned int spin_cnt, delay;
> u32 reg;
>
> + if (smmu->impl->tlb_sync) {
> + smmu->impl->tlb_sync(smmu, page, sync, status);

What I'd hoped is that rather than needing a hook for this, you could
just override smmu_domain->tlb_ops from .init_context to wire up the
alternate .sync method directly. That would save this extra level of
indirection.

Robin.

> + return;
> + }
> +
> arm_smmu_writel(smmu, page, sync, QCOM_DUMMY_VAL);
> for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> diff --git a/drivers/iommu/arm-smmu.h b/drivers/iommu/arm-smmu.h
> index 9645bf1..d3217f1 100644
> --- a/drivers/iommu/arm-smmu.h
> +++ b/drivers/iommu/arm-smmu.h
> @@ -207,6 +207,8 @@ enum arm_smmu_cbar_type {
> /* Maximum number of context banks per SMMU */
> #define ARM_SMMU_MAX_CBS 128
>
> +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
> +#define TLB_SPIN_COUNT 10
>
> /* Shared driver definitions */
> enum arm_smmu_arch_version {
> @@ -336,6 +338,8 @@ struct arm_smmu_impl {
> int (*cfg_probe)(struct arm_smmu_device *smmu);
> int (*reset)(struct arm_smmu_device *smmu);
> int (*init_context)(struct arm_smmu_domain *smmu_domain);
> + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
> + int status);
> };
>
> static inline void __iomem *arm_smmu_page(struct arm_smmu_device *smmu, int n)
>

2019-08-30 18:06:33

by Krishna Reddy

[permalink] [raw]
Subject: RE: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

>> + for (i = 0; i < to_nsmmu(smmu)->num_inst; i++)

>It might make more sense to make this the innermost loop, i.e.:
for (i = 0; i < nsmmu->num_inst; i++)
reg &= readl_relaxed(nsmmu_page(smmu, i, page)...
>since polling the instances in parallel rather than in series seems like it might be a bit more efficient.

Sync register is programmed at the same time for both instances. The status check is serialized.
I can update it to check status of both at the same time.

>> + if (smmu->impl->tlb_sync) {
>> + smmu->impl->tlb_sync(smmu, page, sync, status);

>What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection.

With arm_smmu_domain now available in arm-smmu.h, arm-smmu-nvidia.c can directly update the tlb_ops->tlb_sync and avoid indirection.
Will update in next version.

-KR

2019-08-30 19:02:32

by Krishna Reddy

[permalink] [raw]
Subject: RE: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

>Wouldn't it work if you replaced all calls of __arm_smmu_tlb_sync() by
>smmu->impl->tlb_sync() and assign __arm_smmu_tlb_sync() as default for
>devices that don't need to override it? That makes this patch slightly larger, but it saves us one level of indirection.

The tlb_ops->tlb_sync can be overridden directly in arm-smmu-nvidia.c specific implementation as pointed by Robin. Will be updating in next patch.


>> + void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
>> + int status);

>Can't page, sync and status all be unsigned?

This is to be uniform with original tlb_sync definition is arm-smmu.c. Anyway, this hook is not necessary as tlb_ops->tlb_sync can be overridden directly.

-KR

2019-08-30 22:50:36

by Krishna Reddy

[permalink] [raw]
Subject: RE: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

>> + if (smmu->impl->tlb_sync) {
>> + smmu->impl->tlb_sync(smmu, page, sync, status);

>What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection.

Hi Robin, overriding tlb_ops->tlb_sync function is not enough here.
There are direct references to arm_smmu_tlb_sync_context(), arm_smmu_tlb_sync_global() functions.
In arm-smmu.c. we can replace these direct references with tlb_ops->tlb_sync() function except in one function arm_smmu_device_reset().
When arm_smmu_device_reset() gets called, domains are not initialized and tlb_ops is not available.
Should we add a new hook for arm_smmu_tlb_sync_global() or make this as a responsibility of impl->reset() hook as it gets
called at the same place?

-KR

2019-09-02 13:07:01

by Robin Murphy

[permalink] [raw]
Subject: Re: [PATCH 3/7] iommu/arm-smmu: Add tlb_sync implementation hook

On 30/08/2019 23:49, Krishna Reddy wrote:
>>> + if (smmu->impl->tlb_sync) {
>>> + smmu->impl->tlb_sync(smmu, page, sync, status);
>
>> What I'd hoped is that rather than needing a hook for this, you could just override smmu_domain->tlb_ops from .init_context to wire up the alternate .sync method directly. That would save this extra level of indirection.
>
> Hi Robin, overriding tlb_ops->tlb_sync function is not enough here.
> There are direct references to arm_smmu_tlb_sync_context(), arm_smmu_tlb_sync_global() functions.
> In arm-smmu.c. we can replace these direct references with tlb_ops->tlb_sync() function except in one function arm_smmu_device_reset().
> When arm_smmu_device_reset() gets called, domains are not initialized and tlb_ops is not available.
> Should we add a new hook for arm_smmu_tlb_sync_global() or make this as a responsibility of impl->reset() hook as it gets
> called at the same place?

Ah, right, I'd forgotten about the TLB maintenance on reset. Looking
again, though, I think you might need to implement impl->reset anyway
for the sake of clearing GFSR correctly - since the value read from the
base instance technically may not match whatever bits might happen to be
set in the other instances - so I don't see any issue with just calling
nsmmu_tlb_sync() from there as well.

Robin.