2015-05-18 19:02:27

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH v2] radeon: Deinline indirect register accessor functions

This patch deinlines indirect register accessor functions.

These functions perform two mmio accesses, framed by spin lock/unlock.
Spin lock/unlock by itself takes more than 50 cycles in the ideal case
(if the lock is exclusively cached on the current CPU).

With this .config: http://busybox.net/~vda/kernel_config,
after uninlining, these functions have the following sizes and
callsite counts:

r600_uvd_ctx_rreg: 111 bytes, 4 callsites
r600_uvd_ctx_wreg: 113 bytes, 5 callsites
eg_pif_phy0_rreg: 106 bytes, 13 callsites
eg_pif_phy0_wreg: 108 bytes, 13 callsites
eg_pif_phy1_rreg: 107 bytes, 13 callsites
eg_pif_phy1_wreg: 108 bytes, 13 callsites
rv370_pcie_rreg: 111 bytes, 21 callsites
rv370_pcie_wreg: 113 bytes, 24 callsites
r600_rcu_rreg: 111 bytes, 16 callsites
r600_rcu_wreg: 113 bytes, 25 callsites
cik_didt_rreg: 106 bytes, 10 callsites
cik_didt_wreg: 107 bytes, 10 callsites
tn_smc_rreg: 106 bytes, 126 callsites
tn_smc_wreg: 107 bytes, 116 callsites
eg_cg_rreg: 107 bytes, 20 callsites
eg_cg_wreg: 108 bytes, 52 callsites

Functions r100_mm_rreg() and r100_mm_wreg() have a fast path and
a locked (slow) path.
This patch deinlines only the slow path.

r100_mm_rreg_slow: 78 bytes, 2083 callsites
r100_mm_wreg_slow: 81 bytes, 3570 callsites

Reduction in code size is more than 65,000 bytes:

text data bss dec hex filename
85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
85674192 22294776 20627456 128598664 7aa4288 vmlinux

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Christian König <[email protected]>
Cc: Alex Deucher <[email protected]>
Cc: [email protected]
---
Changes in v2: only partially deinline r100_mm_r/wreg

drivers/gpu/drm/radeon/r100.c | 22 ++++
drivers/gpu/drm/radeon/radeon.h | 218 ++++-----------------------------
drivers/gpu/drm/radeon/radeon_device.c | 179 +++++++++++++++++++++++++++
3 files changed, 223 insertions(+), 196 deletions(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 04f2514..238b13f 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -4090,6 +4090,28 @@ int r100_init(struct radeon_device *rdev)
return 0;
}

+uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg)
+{
+ unsigned long flags;
+ uint32_t ret;
+
+ spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+ writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+ ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+ spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+ return ret;
+}
+
+void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
+ writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
+ writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
+ spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
+}
+
u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
{
if (reg < rdev->rio_mem_size)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5587603..d9a7c55 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2465,38 +2465,24 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);

#define RADEON_MIN_MMIO_SIZE 0x10000

+uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg);
+void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v);
static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
bool always_indirect)
{
/* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
return readl(((void __iomem *)rdev->rmmio) + reg);
- else {
- unsigned long flags;
- uint32_t ret;
-
- spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
- writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
- ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
- spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
-
- return ret;
- }
+ else
+ return r100_mm_rreg_slow(rdev, reg);
}
-
static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
bool always_indirect)
{
if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
writel(v, ((void __iomem *)rdev->rmmio) + reg);
- else {
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
- writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
- writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
- spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
- }
+ else
+ r100_mm_wreg_slow(rdev, reg, v);
}

u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
@@ -2582,182 +2568,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
/*
* Indirect registers accessor
*/
-static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
-{
- unsigned long flags;
- uint32_t r;
-
- spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
- WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
- r = RREG32(RADEON_PCIE_DATA);
- spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
- return r;
-}
-
-static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
- WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
- WREG32(RADEON_PCIE_DATA, (v));
- spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
-}
-
-static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->smc_idx_lock, flags);
- WREG32(TN_SMC_IND_INDEX_0, (reg));
- r = RREG32(TN_SMC_IND_DATA_0);
- spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
- return r;
-}
-
-static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->smc_idx_lock, flags);
- WREG32(TN_SMC_IND_INDEX_0, (reg));
- WREG32(TN_SMC_IND_DATA_0, (v));
- spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
-}
-
-static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
- WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
- r = RREG32(R600_RCU_DATA);
- spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
- return r;
-}
-
-static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
- WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
- WREG32(R600_RCU_DATA, (v));
- spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
-}
-
-static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->cg_idx_lock, flags);
- WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_CG_IND_DATA);
- spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
- return r;
-}
-
-static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->cg_idx_lock, flags);
- WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
- WREG32(EVERGREEN_CG_IND_DATA, (v));
- spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_PIF_PHY0_DATA);
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
- return r;
-}
-
-static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
- WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
- r = RREG32(EVERGREEN_PIF_PHY1_DATA);
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
- return r;
-}
-
-static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->pif_idx_lock, flags);
- WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
- WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
- spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
-}
-
-static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
- WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
- r = RREG32(R600_UVD_CTX_DATA);
- spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
- return r;
-}
-
-static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
- WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
- WREG32(R600_UVD_CTX_DATA, (v));
- spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
-}
-
-
-static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
-{
- unsigned long flags;
- u32 r;
-
- spin_lock_irqsave(&rdev->didt_idx_lock, flags);
- WREG32(CIK_DIDT_IND_INDEX, (reg));
- r = RREG32(CIK_DIDT_IND_DATA);
- spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
- return r;
-}
-
-static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rdev->didt_idx_lock, flags);
- WREG32(CIK_DIDT_IND_INDEX, (reg));
- WREG32(CIK_DIDT_IND_DATA, (v));
- spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
-}
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);

void r100_pll_errata_after_index(struct radeon_device *rdev);

diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index bd7519f..6712505 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
rdev->flags &= ~RADEON_IS_PX;
}

+/*
+ * Indirect registers accessor
+ */
+uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
+{
+ unsigned long flags;
+ uint32_t r;
+
+ spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+ WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+ r = RREG32(RADEON_PCIE_DATA);
+ spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+ return r;
+}
+
+void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
+ WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
+ WREG32(RADEON_PCIE_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
+}
+
+u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+ WREG32(TN_SMC_IND_INDEX_0, (reg));
+ r = RREG32(TN_SMC_IND_DATA_0);
+ spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+ return r;
+}
+
+void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->smc_idx_lock, flags);
+ WREG32(TN_SMC_IND_INDEX_0, (reg));
+ WREG32(TN_SMC_IND_DATA_0, (v));
+ spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
+}
+
+u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+ WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+ r = RREG32(R600_RCU_DATA);
+ spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+ return r;
+}
+
+void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
+ WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
+ WREG32(R600_RCU_DATA, (v));
+ spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
+}
+
+u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+ WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_CG_IND_DATA);
+ spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+ return r;
+}
+
+void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->cg_idx_lock, flags);
+ WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
+ WREG32(EVERGREEN_CG_IND_DATA, (v));
+ spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
+}
+
+u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_PIF_PHY0_DATA);
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+ return r;
+}
+
+void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
+ WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+ r = RREG32(EVERGREEN_PIF_PHY1_DATA);
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+ return r;
+}
+
+void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->pif_idx_lock, flags);
+ WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
+ WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
+ spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
+}
+
+u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+ WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+ r = RREG32(R600_UVD_CTX_DATA);
+ spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+ return r;
+}
+
+void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
+ WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
+ WREG32(R600_UVD_CTX_DATA, (v));
+ spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
+}
+
+u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
+{
+ unsigned long flags;
+ u32 r;
+
+ spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+ WREG32(CIK_DIDT_IND_INDEX, (reg));
+ r = RREG32(CIK_DIDT_IND_DATA);
+ spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+ return r;
+}
+
+void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->didt_idx_lock, flags);
+ WREG32(CIK_DIDT_IND_INDEX, (reg));
+ WREG32(CIK_DIDT_IND_DATA, (v));
+ spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
+}
+
/**
* radeon_program_register_sequence - program an array of registers.
*
--
1.8.1.4


2015-05-18 19:10:03

by Christian König

[permalink] [raw]
Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions

On 18.05.2015 21:02, Denys Vlasenko wrote:
> This patch deinlines indirect register accessor functions.
>
> These functions perform two mmio accesses, framed by spin lock/unlock.
> Spin lock/unlock by itself takes more than 50 cycles in ideal case
> (if lock is exclusively cached on current CPU).
>
> With this .config: http://busybox.net/~vda/kernel_config,
> after uninlining these functions have sizes and callsite counts
> as follows:
>
> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
> eg_pif_phy0_rreg: 106 bytes, 13 callsites
> eg_pif_phy0_wreg: 108 bytes, 13 callsites
> eg_pif_phy1_rreg: 107 bytes, 13 callsites
> eg_pif_phy1_wreg: 108 bytes, 13 callsites
> rv370_pcie_rreg: 111 bytes, 21 callsites
> rv370_pcie_wreg: 113 bytes, 24 callsites
> r600_rcu_rreg: 111 bytes, 16 callsites
> r600_rcu_wreg: 113 bytes, 25 callsites
> cik_didt_rreg: 106 bytes, 10 callsites
> cik_didt_wreg: 107 bytes, 10 callsites
> tn_smc_rreg: 106 bytes, 126 callsites
> tn_smc_wreg: 107 bytes, 116 callsites
> eg_cg_rreg: 107 bytes, 20 callsites
> eg_cg_wreg: 108 bytes, 52 callsites
>
> Functions r100_mm_rreg() and r100_mm_rreg() have a fast path and
> locked (slow) path.
> This patch deinlines only slow path.
>
> r100_mm_rreg_slow: 78 bytes, 2083 callsites
> r100_mm_wreg_slow: 81 bytes, 3570 callsites
>
> Reduction in code size is more than 65,000 bytes:
>
> text data bss dec hex filename
> 85740176 22294680 20627456 128662312 7ab3b28 vmlinux.before
> 85674192 22294776 20627456 128598664 7aa4288 vmlinux
>
> Signed-off-by: Denys Vlasenko <[email protected]>
> Cc: Christian König <[email protected]>
> Cc: Alex Deucher <[email protected]>
> Cc: [email protected]
> ---
> Changes in v2: only partially deinline r100_mm_r/wreg
>
> drivers/gpu/drm/radeon/r100.c | 22 ++++
> drivers/gpu/drm/radeon/radeon.h | 218 ++++-----------------------------
> drivers/gpu/drm/radeon/radeon_device.c | 179 +++++++++++++++++++++++++++

Sorry, I hadn't noticed that before:

radeon_device.c is most likely not the right place for the non-inlined
functions. Please move them into the appropriate files for each
generation.

As noted in the other mail as well, please also CC dri-devel.

Regards,
Christian.

> 3 files changed, 223 insertions(+), 196 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 04f2514..238b13f 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -4090,6 +4090,28 @@ int r100_init(struct radeon_device *rdev)
> return 0;
> }
>
> +uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg)
> +{
> + unsigned long flags;
> + uint32_t ret;
> +
> + spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> + writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> + ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> + spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> + return ret;
> +}
> +
> +void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> + writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> + writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> + spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> +}
> +
> u32 r100_io_rreg(struct radeon_device *rdev, u32 reg)
> {
> if (reg < rdev->rio_mem_size)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5587603..d9a7c55 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -2465,38 +2465,24 @@ int radeon_gpu_wait_for_idle(struct radeon_device *rdev);
>
> #define RADEON_MIN_MMIO_SIZE 0x10000
>
> +uint32_t r100_mm_rreg_slow(struct radeon_device *rdev, uint32_t reg);
> +void r100_mm_wreg_slow(struct radeon_device *rdev, uint32_t reg, uint32_t v);
> static inline uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg,
> bool always_indirect)
> {
> /* The mmio size is 64kb at minimum. Allows the if to be optimized out. */
> if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> return readl(((void __iomem *)rdev->rmmio) + reg);
> - else {
> - unsigned long flags;
> - uint32_t ret;
> -
> - spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> - writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> - ret = readl(((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> - spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> -
> - return ret;
> - }
> + else
> + return r100_mm_rreg_slow(rdev, reg);
> }
> -
> static inline void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v,
> bool always_indirect)
> {
> if ((reg < rdev->rmmio_size || reg < RADEON_MIN_MMIO_SIZE) && !always_indirect)
> writel(v, ((void __iomem *)rdev->rmmio) + reg);
> - else {
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->mmio_idx_lock, flags);
> - writel(reg, ((void __iomem *)rdev->rmmio) + RADEON_MM_INDEX);
> - writel(v, ((void __iomem *)rdev->rmmio) + RADEON_MM_DATA);
> - spin_unlock_irqrestore(&rdev->mmio_idx_lock, flags);
> - }
> + else
> + r100_mm_wreg_slow(rdev, reg, v);
> }
>
> u32 r100_io_rreg(struct radeon_device *rdev, u32 reg);
> @@ -2582,182 +2568,22 @@ static inline struct radeon_fence *to_radeon_fence(struct fence *f)
> /*
> * Indirect registers accessor
> */
> -static inline uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> -{
> - unsigned long flags;
> - uint32_t r;
> -
> - spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> - WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> - r = RREG32(RADEON_PCIE_DATA);
> - spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> - WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> - WREG32(RADEON_PCIE_DATA, (v));
> - spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> -}
> -
> -static inline u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> - WREG32(TN_SMC_IND_INDEX_0, (reg));
> - r = RREG32(TN_SMC_IND_DATA_0);
> - spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> - WREG32(TN_SMC_IND_INDEX_0, (reg));
> - WREG32(TN_SMC_IND_DATA_0, (v));
> - spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> -}
> -
> -static inline u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> - WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> - r = RREG32(R600_RCU_DATA);
> - spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> - WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> - WREG32(R600_RCU_DATA, (v));
> - spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> -}
> -
> -static inline u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> - WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> - r = RREG32(EVERGREEN_CG_IND_DATA);
> - spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> - WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> - WREG32(EVERGREEN_CG_IND_DATA, (v));
> - spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> - WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> - r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> - spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> - WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> - WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> - spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> - WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> - r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> - spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> - WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> - WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> - spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> -}
> -
> -static inline u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> - WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> - r = RREG32(R600_UVD_CTX_DATA);
> - spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> - WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> - WREG32(R600_UVD_CTX_DATA, (v));
> - spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> -}
> -
> -
> -static inline u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> -{
> - unsigned long flags;
> - u32 r;
> -
> - spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> - WREG32(CIK_DIDT_IND_INDEX, (reg));
> - r = RREG32(CIK_DIDT_IND_DATA);
> - spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> - return r;
> -}
> -
> -static inline void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> - WREG32(CIK_DIDT_IND_INDEX, (reg));
> - WREG32(CIK_DIDT_IND_DATA, (v));
> - spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> -}
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg);
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg);
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg);
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg);
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v);
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg);
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v);
>
> void r100_pll_errata_after_index(struct radeon_device *rdev);
>
> diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
> index bd7519f..6712505 100644
> --- a/drivers/gpu/drm/radeon/radeon_device.c
> +++ b/drivers/gpu/drm/radeon/radeon_device.c
> @@ -161,6 +161,185 @@ static void radeon_device_handle_px_quirks(struct radeon_device *rdev)
> rdev->flags &= ~RADEON_IS_PX;
> }
>
> +/*
> + * Indirect registers accessor
> + */
> +uint32_t rv370_pcie_rreg(struct radeon_device *rdev, uint32_t reg)
> +{
> + unsigned long flags;
> + uint32_t r;
> +
> + spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> + WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> + r = RREG32(RADEON_PCIE_DATA);
> + spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> + return r;
> +}
> +
> +void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->pcie_idx_lock, flags);
> + WREG32(RADEON_PCIE_INDEX, ((reg) & rdev->pcie_reg_mask));
> + WREG32(RADEON_PCIE_DATA, (v));
> + spin_unlock_irqrestore(&rdev->pcie_idx_lock, flags);
> +}
> +
> +u32 tn_smc_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> + WREG32(TN_SMC_IND_INDEX_0, (reg));
> + r = RREG32(TN_SMC_IND_DATA_0);
> + spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> + return r;
> +}
> +
> +void tn_smc_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->smc_idx_lock, flags);
> + WREG32(TN_SMC_IND_INDEX_0, (reg));
> + WREG32(TN_SMC_IND_DATA_0, (v));
> + spin_unlock_irqrestore(&rdev->smc_idx_lock, flags);
> +}
> +
> +u32 r600_rcu_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> + WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> + r = RREG32(R600_RCU_DATA);
> + spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> + return r;
> +}
> +
> +void r600_rcu_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->rcu_idx_lock, flags);
> + WREG32(R600_RCU_INDEX, ((reg) & 0x1fff));
> + WREG32(R600_RCU_DATA, (v));
> + spin_unlock_irqrestore(&rdev->rcu_idx_lock, flags);
> +}
> +
> +u32 eg_cg_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> + WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> + r = RREG32(EVERGREEN_CG_IND_DATA);
> + spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> + return r;
> +}
> +
> +void eg_cg_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->cg_idx_lock, flags);
> + WREG32(EVERGREEN_CG_IND_ADDR, ((reg) & 0xffff));
> + WREG32(EVERGREEN_CG_IND_DATA, (v));
> + spin_unlock_irqrestore(&rdev->cg_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy0_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> + WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> + r = RREG32(EVERGREEN_PIF_PHY0_DATA);
> + spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> + return r;
> +}
> +
> +void eg_pif_phy0_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> + WREG32(EVERGREEN_PIF_PHY0_INDEX, ((reg) & 0xffff));
> + WREG32(EVERGREEN_PIF_PHY0_DATA, (v));
> + spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 eg_pif_phy1_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> + WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> + r = RREG32(EVERGREEN_PIF_PHY1_DATA);
> + spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> + return r;
> +}
> +
> +void eg_pif_phy1_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->pif_idx_lock, flags);
> + WREG32(EVERGREEN_PIF_PHY1_INDEX, ((reg) & 0xffff));
> + WREG32(EVERGREEN_PIF_PHY1_DATA, (v));
> + spin_unlock_irqrestore(&rdev->pif_idx_lock, flags);
> +}
> +
> +u32 r600_uvd_ctx_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> + WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> + r = RREG32(R600_UVD_CTX_DATA);
> + spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> + return r;
> +}
> +
> +void r600_uvd_ctx_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->uvd_idx_lock, flags);
> + WREG32(R600_UVD_CTX_INDEX, ((reg) & 0x1ff));
> + WREG32(R600_UVD_CTX_DATA, (v));
> + spin_unlock_irqrestore(&rdev->uvd_idx_lock, flags);
> +}
> +
> +u32 cik_didt_rreg(struct radeon_device *rdev, u32 reg)
> +{
> + unsigned long flags;
> + u32 r;
> +
> + spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> + WREG32(CIK_DIDT_IND_INDEX, (reg));
> + r = RREG32(CIK_DIDT_IND_DATA);
> + spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> + return r;
> +}
> +
> +void cik_didt_wreg(struct radeon_device *rdev, u32 reg, u32 v)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&rdev->didt_idx_lock, flags);
> + WREG32(CIK_DIDT_IND_INDEX, (reg));
> + WREG32(CIK_DIDT_IND_DATA, (v));
> + spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);
> +}
> +
> /**
> * radeon_program_register_sequence - program an array of registers.
> *

2015-05-18 22:50:33

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions

On Mon, May 18, 2015 at 9:09 PM, Christian König
<[email protected]> wrote:
>> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
>> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
>> eg_pif_phy0_rreg: 106 bytes, 13 callsites
>> eg_pif_phy0_wreg: 108 bytes, 13 callsites
>> eg_pif_phy1_rreg: 107 bytes, 13 callsites
>> eg_pif_phy1_wreg: 108 bytes, 13 callsites
>> rv370_pcie_rreg: 111 bytes, 21 callsites
>> rv370_pcie_wreg: 113 bytes, 24 callsites
>> r600_rcu_rreg: 111 bytes, 16 callsites
>> r600_rcu_wreg: 113 bytes, 25 callsites
>> cik_didt_rreg: 106 bytes, 10 callsites
>> cik_didt_wreg: 107 bytes, 10 callsites
>> tn_smc_rreg: 106 bytes, 126 callsites
>> tn_smc_wreg: 107 bytes, 116 callsites
>> eg_cg_rreg: 107 bytes, 20 callsites
>> eg_cg_wreg: 108 bytes, 52 callsites

> Sorry haven't noticed that before:
>
> radeon_device.c is most likely not the right place for the non-inlined
> functions. Please move them into the appropriate files for each
> generation.

Will do (probably tomorrow, not today).

Can you help me here a bit?
There are LOTS of *.c files in drm/radeon/.
I guess r600_ functions should go into r600.c,
rv370_ to rv730_dpm.c (right?),
but some of the function names are less clear (to me).

Where would you like eg_pif_phyN_r/wreg() go? evergreen.c?
Should eg_cg_r/wreg() also go to this file?

cik_didt_r/wreg() - to cik.c?

tn_smc_r/wreg()? Is tn = trinity? so, trinity_smc.c?

2015-05-19 00:39:18

by Deucher, Alexander

[permalink] [raw]
Subject: RE: [PATCH v2] radeon: Deinline indirect register accessor functions

> -----Original Message-----
> From: Denys Vlasenko [mailto:[email protected]]
> Sent: Monday, May 18, 2015 6:50 PM
> To: Koenig, Christian
> Cc: Denys Vlasenko; Deucher, Alexander; Linux Kernel Mailing List
> Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions
>
> On Mon, May 18, 2015 at 9:09 PM, Christian König
> <[email protected]> wrote:
> >> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
> >> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
> >> eg_pif_phy0_rreg: 106 bytes, 13 callsites
> >> eg_pif_phy0_wreg: 108 bytes, 13 callsites
> >> eg_pif_phy1_rreg: 107 bytes, 13 callsites
> >> eg_pif_phy1_wreg: 108 bytes, 13 callsites
> >> rv370_pcie_rreg: 111 bytes, 21 callsites
> >> rv370_pcie_wreg: 113 bytes, 24 callsites
> >> r600_rcu_rreg: 111 bytes, 16 callsites
> >> r600_rcu_wreg: 113 bytes, 25 callsites
> >> cik_didt_rreg: 106 bytes, 10 callsites
> >> cik_didt_wreg: 107 bytes, 10 callsites
> >> tn_smc_rreg: 106 bytes, 126 callsites
> >> tn_smc_wreg: 107 bytes, 116 callsites
> >> eg_cg_rreg: 107 bytes, 20 callsites
> >> eg_cg_wreg: 108 bytes, 52 callsites
>
> > Sorry haven't noticed that before:
> >
> > radeon_device.c is most likely not the right place for the non-inlined
> > functions. Please move them into the appropriate files for each
> > generation.
>
> Will do (probably tomorrow, not today).

Is this whole exercise really worthwhile? This will be the 3rd or 4th time these have been inlined/uninlined.

>
> Can you help me here a bit?
> There are LOTS of *.c files in drm/radeon/.
> I guess r600_ functions should go into r600.c,

Yes.

> rv370_ to rv730_dpm.c (right?),

No. rv370_ should go in r300.c

> but some of the function names are less clear (to me).
>
> Where would you like eg_pif_phyN_r/wreg() go? evergreen.c?

Yes.

> Should eg_cg_r/wreg() also go to this file?

Yes.

>
> cik_didt_r/wreg() - to cik.c?

Yes.

>
> tn_smc_r/wreg()? Is tn = trinity? so, trinity_smc.c?

ni.c

Alex

????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2015-05-20 10:47:50

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions

On 05/19/2015 01:06 AM, Deucher, Alexander wrote:
>> -----Original Message-----
>> From: Denys Vlasenko [mailto:[email protected]]
>> Sent: Monday, May 18, 2015 6:50 PM
>> To: Koenig, Christian
>> Cc: Denys Vlasenko; Deucher, Alexander; Linux Kernel Mailing List
>> Subject: Re: [PATCH v2] radeon: Deinline indirect register accessor functions
>>
>> On Mon, May 18, 2015 at 9:09 PM, Christian König
>> <[email protected]> wrote:
>>>> r600_uvd_ctx_rreg: 111 bytes, 4 callsites
>>>> r600_uvd_ctx_wreg: 113 bytes, 5 callsites
>>>> eg_pif_phy0_rreg: 106 bytes, 13 callsites
>>>> eg_pif_phy0_wreg: 108 bytes, 13 callsites
>>>> eg_pif_phy1_rreg: 107 bytes, 13 callsites
>>>> eg_pif_phy1_wreg: 108 bytes, 13 callsites
>>>> rv370_pcie_rreg: 111 bytes, 21 callsites
>>>> rv370_pcie_wreg: 113 bytes, 24 callsites
>>>> r600_rcu_rreg: 111 bytes, 16 callsites
>>>> r600_rcu_wreg: 113 bytes, 25 callsites
>>>> cik_didt_rreg: 106 bytes, 10 callsites
>>>> cik_didt_wreg: 107 bytes, 10 callsites
>>>> tn_smc_rreg: 106 bytes, 126 callsites
>>>> tn_smc_wreg: 107 bytes, 116 callsites
>>>> eg_cg_rreg: 107 bytes, 20 callsites
>>>> eg_cg_wreg: 108 bytes, 52 callsites
>>
>>> Sorry haven't noticed that before:
>>>
>>> radeon_device.c is most likely not the right place for the non-inlined
>>> functions. Please move them into the appropriate files for each
>>> generation.
>>
>> Will do (probably tomorrow, not today).
>
> Is this whole exercise really worthwhile?
> This will be the 3rd or 4th time these have been inlined/uninlined.

When code grows by 65000 bytes, there ought to be a good reason to inline.
I don't see it.

Let's take a look at what these functions actually do. cik_didt_wreg() is:

spin_lock_irqsave(&rdev->didt_idx_lock, flags);
WREG32(CIK_DIDT_IND_INDEX, (reg));
WREG32(CIK_DIDT_IND_DATA, (v));
spin_unlock_irqrestore(&rdev->didt_idx_lock, flags);

this compiles to (on defconfig + radeon enabled):

55 push %rbp
48 89 e5 mov %rsp,%rbp
48 83 ec 20 sub $0x20,%rsp
4c 89 65 e8 mov %r12,-0x18(%rbp)
4c 8d a7 cc 01 00 00 lea 0x1cc(%rdi),%r12
48 89 5d e0 mov %rbx,-0x20(%rbp)
48 89 fb mov %rdi,%rbx
4c 89 6d f0 mov %r13,-0x10(%rbp)
4c 89 75 f8 mov %r14,-0x8(%rbp)
4c 89 e7 mov %r12,%rdi
41 89 d6 mov %edx,%r14d
41 89 f5 mov %esi,%r13d
e8 20 6b 4d 00 callq <_raw_spin_lock_irqsave> //spin_lock_irqsave
48 8b 93 d0 01 00 00 mov 0x1d0(%rbx),%rdx
44 89 aa 00 ca 00 00 mov %r13d,0xca00(%rdx) //WREG32
48 8b 93 d0 01 00 00 mov 0x1d0(%rbx),%rdx
44 89 b2 04 ca 00 00 mov %r14d,0xca04(%rdx) //WREG32
4c 89 e7 mov %r12,%rdi
48 89 c6 mov %rax,%rsi
e8 b9 69 4d 00 callq <_raw_spin_unlock_irqrestore> //spin_unlock_irqrestore
48 8b 5d e0 mov -0x20(%rbp),%rbx
4c 8b 65 e8 mov -0x18(%rbp),%r12
4c 8b 6d f0 mov -0x10(%rbp),%r13
4c 8b 75 f8 mov -0x8(%rbp),%r14
c9 leaveq
c3 retq

<_raw_spin_lock_irqsave>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
9c pushfq
58 pop %rax
fa cli
ba 00 01 00 00 mov $0x100,%edx
f0 66 0f c1 17 lock xadd %dx,(%rdi) // expensive
0f b6 ce movzbl %dh,%ecx
38 d1 cmp %dl,%cl
75 04 jne <_raw_spin_lock_irqsave+0x1c>
5d pop %rbp
c3 retq
f3 90 pause
0f b6 17 movzbl (%rdi),%edx
38 ca cmp %cl,%dl
75 f7 jne <_raw_spin_lock_irqsave+0x1a>
5d pop %rbp
c3 retq

<_raw_spin_unlock_irqrestore>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
80 07 01 addb $0x1,(%rdi)
56 push %rsi
9d popfq //expensive
5d pop %rbp
c3 retq

Now, using the attached test program, I measure how long
a call+ret pair takes:

# ./timing_test64 callret
400000000 loops in 0.71467s = 1.79 nsec/loop for callret

Unlocked read-modify-write memory operation:

# ./timing_test64 or
400000000 loops in 0.86119s = 2.15 nsec/loop for or

Locked read-modify-write memory operations:

# ./timing_test64 lock_or
100000000 loops in 0.68902s = 6.89 nsec/loop for lock_or
# ./timing_test64 lock_xadd
100000000 loops in 0.68582s = 6.86 nsec/loop for lock_xadd

And POPF:

# ./timing_test64 popf
100000000 loops in 0.68861s = 6.89 nsec/loop for popf

This is on Sandy Bridge CPU with cycle time of about 0.30 ns:

# ./timing_test64 nothing
2000000000 loops in 0.59716s = 0.30 nsec/loop for nothing


So, what do we see?

call+ret takes 5 cycles. This is cheaper than one unlocked
RMW memory operation, which is 7 cycles.

Locked RMW is 21 cycles in the ideal case (this is what
spin_lock_irqsave does). POPF is also 21 cycles
(spin_unlock_irqrestore does this). Add to this two mmio
accesses (easily 50s of cycles) and all other necessary operations
visible in the assembly code - 5 memory stores,
7 memory loads, and two call+ret pairs.

I expect the overhead of call+ret added by deinlining to be in the 1-4% range,
if you run a microbenchmark which does nothing but one of these ops.
--
vda


Attachments:
timing_test.c (8.07 kB)